This code was used for the analysis of Cariaco Basin metagenomes and metatranscriptomes for the 2023 paper by Geller-McGrath et al. Some sections of this analysis were done on a compute node of the Poseidon High Performance Computing (HPC) cluster based at the Woods Hole Oceanographic Institute (WHOI). All of the analyses done here using R can be run locally; HPC processing was used to speed up computations.
library(tidyverse)
library(ComplexHeatmap)
library(readxl)
library(umap)
library(cowplot)
library(dbscan)
library(ggtext) # used for ggplots
library(glue) # used for ggplots
library(cowplot) # for combining various plots together in grids
library(gridtext) # for editing text in heatmap plots
library(ComplexHeatmap) # for heatmap plots
library(pheatmap) # for heatmap plots
library(janitor)
library(ggfortify)
library(grid)
# BGC frequency plot by phylum --------------------------------------------
bgc_count_data_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bgc_count_data_summary.rds')
smc = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/smc.rds')
highlight <- function(x, pat, color = "black", family = "") {
ifelse(grepl(pat, x), glue("<b style='font-family:{family}; color:{color}'>{x}</b>"), x)
}
face2 = bgc_count_data_summary %>%
select(phylum, domain, n_mags_in_phylum) %>%
distinct() %>%
filter(n_mags_in_phylum == 1) %>%
mutate(type = 'bold')
bact_final_sm_freq_plot = bgc_count_data_summary %>%
filter(domain == 'Bacteria') %>%
ggplot(aes(as_factor(phylum), norm_sm_freq,
fill = fct_rev(fct_infreq(product)))) +
geom_bar(stat = 'identity') +
theme_bw() +
theme(axis.text.x = element_text(angle = 70, hjust = 1, size = 10),
axis.text.y = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = c(0.98, 0.98),
legend.justification = c(0.98, 0.98),
legend.text = element_text(size = 11),
legend.title = element_text(size = 12),
legend.background = element_blank(), panel.border =
element_rect(color = 'black', fill = NA),
legend.box.background = element_rect(color = 'black'),
panel.grid = element_blank(),
strip.text = element_text(size = 13),
axis.title.y = element_blank(),
strip.background = element_rect(fill = 'wheat1')
) +
labs(x = '\nPhylum',
fill = 'Secondary metabolite class') +
scale_fill_manual(values = smc) +
facet_grid(~ domain, space = 'free', scales = 'free') +
scale_x_discrete(labels = function(x) highlight(x, paste0(face2$phylum, collapse = '|'), 'black')) +
theme(axis.text.x = element_markdown())# +
arch_final_sm_freq_plot = bgc_count_data_summary %>%
filter(domain == 'Archaea') %>%
ggplot(aes(as_factor(phylum), norm_sm_freq,
fill = fct_rev(fct_infreq(product)))) +
geom_bar(stat = 'identity') +
theme_bw() +
theme(axis.text.x = element_text(angle = 70, hjust = 1),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = 'none',
panel.grid = element_blank(),
strip.text = element_text(size = 13),
axis.title.y = element_blank(),
strip.background = element_rect(fill = 'wheat1')
) +
labs(x = '\nPhylum',
fill = 'Secondary metabolite class') +
scale_fill_manual(values = smc) +
facet_grid(~ domain, space = 'free', scales = 'free') +
scale_x_discrete(labels = function(x) highlight(x, paste0(face2$phylum, collapse = '|'), 'black')) +
theme(axis.text.x = element_markdown())
# save the sm biosynthetic potential plot
sm_freq_plot = plot_grid(bact_final_sm_freq_plot, NULL, arch_final_sm_freq_plot,
labels = c('a', '', 'b'),
rel_widths = c(3, 0.02, 1),
nrow = 1,
align = 'hv'#,
#label_x = c(0.0975, 0, 0)
)
sm_freq_plot
mm2_final = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')
metaT_col_order = tibble(
sample_name = colnames(mm2_final)[2:ncol(mm2_final)],
fraction = if_else(
str_detect(sample_name, 'PA'), 'PA', 'FL'),
depth = str_replace(
sample_name, '.*(\\d{3}).*', '\\1')
) %>%
arrange(desc(fraction), depth) %>%
pull(sample_name)
# create color specifications for column annotations
anno_colors = list(
Fraction = c(
'PA' = 'darkgreen',
'FL' = 'darkseagreen2'),
Layer = c(
'Oxycline' = 'darkslategray3', #'wheat2',
'Shallow anoxic' = 'gold',# 'khaki2',
'Euxinic' = 'sienna3'),
Season = c('May' = 'slategray',
'November' = 'slategray1'),
`Depth (m)` = c(
'103' = 'grey90',
'148' = 'grey85',
'198' = 'grey80',
'200' = 'grey75',
'234' = 'grey70',
'237' = 'grey65',
'247' = 'grey60',
'267' = 'grey55',
'295' = 'grey50',
'314' = 'grey45',
'900' = 'grey20'
)
)
# final plot used in manuscript
mm2_final_mod =
mm2_final %>%
mutate(group_col = str_replace(
gene_name, '(MAG_\\d+-ctg\\d+)_.*', '\\1'), .before = 1) %>%
relocate(group_col) %>%
mutate(rowSums = rowSums(.[3:ncol(.)]), .after = 1) %>%
mutate(rs_pa = rowSums(.[str_detect(colnames(.), 'PA')]),
rs_fl = rowSums(.[str_detect(colnames(.), 'FL')])) %>%
relocate(c(rs_pa, rs_fl), .after = 1) %>%
filter(
rowSums > 19
) %>%
group_by(group_col) %>%
mutate(group_PA_rowSums = sum(rs_pa), .after = 1) %>%
mutate(group_FL_rowSums = sum(rs_fl), .after = 1) %>%
mutate(sorting_col = if_else(
group_PA_rowSums > group_FL_rowSums, 'PA', 'FL'), .after = group_PA_rowSums) %>%
ungroup() %>%
arrange(desc(group_PA_rowSums), sorting_col) %>%
select(-c(rowSums, group_PA_rowSums, group_FL_rowSums,
rs_pa, rs_fl, group_col, sorting_col)) %>%
column_to_rownames(var = 'gene_name') %>%
relocate(all_of(metaT_col_order))
mm2_final_mod =
mm2_final_mod %>%
mutate(pa_sums = rowSums(.[str_detect(colnames(.), 'PA')]),
fl_sums = rowSums(.[str_detect(colnames(.), 'FL')]) ) %>%
relocate(c(pa_sums, fl_sums), .before = 1) %>%
mutate(row_sorter = if_else(pa_sums > fl_sums, TRUE, FALSE), .before = 1) %>%
arrange(desc(row_sorter), desc(pa_sums)) %>%
select(-c(pa_sums, fl_sums, row_sorter)) %>%
relocate(colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = x[str_detect(x, 'NPA')],
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_', seq(1, 6, by = 1)), each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(pa, fl)
g = str_split(g, pattern = ';') %>% unlist()
return(g)}}())
metaT_row_anno = tibble(
gene_name = rownames(mm2_final_mod),
mag_name = str_replace(gene_name, '(MAG_\\d+)-.*', '\\1')
) %>%
column_to_rownames(var = 'gene_name')
# create column annotation information object
col_anno = tibble(
sample_name = colnames(mm2_final_mod),
Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
mutate(Depth = as.integer(Depth),
Layer = case_when(
Depth < 247 ~ 'Oxycline',
Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(Depth = as.character(Depth)) %>%
rename(`Depth (m)` = Depth) %>%
column_to_rownames(var = 'sample_name')
mm2_final_plot =
mm2_final_mod %>%
rownames_to_column(var = 'gene_name') %>%
mutate(mag_name = str_replace(
gene_name, '(MAG_\\d+)-.*', '\\1')) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%
select(-mag_name) %>%
column_to_rownames(var = 'gene_name') %>%
as.matrix() %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
show_colnames = FALSE,
annotation_col = col_anno,
#annotation_row = metaT_row_anno,
annotation_colors = anno_colors,
gaps_col = 24,
clustering_method = 'mcquitty',
#cluster_rows = FALSE,
cluster_cols = FALSE,
show_rownames = FALSE,
border_color = 'grey60',
treeheight_row = 0,
treeheight_col = 0,
color =
c(#colorRampPalette(colors = 'white')(100),
colorRampPalette(colors = c(
'gray98',
'#aae6fa',
'#c9e6f0',
'#f7e96d',
'#f7e43b',
'goldenrod1',
'orange',
'#ff6a00',
'#D73027',
'purple4'))(1000)
),
legend_breaks = seq(0, 6, by = 2),
legend_labels = gt_render(c(
'0',
'6.39 x 10^0',
'5.36 x 10^1',
'4.02 x 10^2')),
show_row_dend = FALSE,
show_column_dend = FALSE
)
mm2_final_plot = ComplexHeatmap::draw(
mm2_final_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right',
annotation_legend_side = 'right'
)
# Metatranscriptome UMAP analysis ----------------------------------------------------
setwd('~/Documents/cariaco-tidy-analysis/')
rnaseq_mapping_counts = readRDS('rdata-files/rnaseq_read_mapping_minimap2_on_cariaco_genes.rds')
bgc_genes_10kb = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bgc_genes_10kb.rds')
mm2_final = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')
col_metadata = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/col_metadata.rds')
rnaseq_umap_tbl = mm2_final |>
filter(gene_name %in% bgc_genes_10kb$gene_name) |>
mutate(contig = str_replace(gene_name, '(MAG_\\d+)-ctg(.*)_\\d+', '\\1_\\2'),
mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) |>
relocate(c(mag_name, contig), .before = 1) |>
select(-gene_name) |>
group_by(mag_name, contig) |>
summarize(across(everything(), ~ sum(.x))) |>
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983'))
## `summarise()` has grouped output by 'mag_name'. You can override using the
## `.groups` argument.
# need the environmental metadata
cariaco_env_sheets = excel_sheets('~/Documents/cariaco-tidy-analysis/stables/Cariaco_Supplementary_files_2022.xlsx')
cariaco_env_vars = read_excel('~/Documents/cariaco-tidy-analysis/stables/Cariaco_Supplementary_files_2022.xlsx',
sheet = cariaco_env_sheets[6])
cariaco_env_vars = cariaco_env_vars |>
filter(sample_type == 'metagenome') |>
select(-c(ncbi_metagenome_name, ncbi_metatranscriptome_name,
replicate, sample_type, year))
cariaco_env_vars = cariaco_env_vars |>
mutate(across(where(is.character), ~ if_else(.x == 'NA', NA_character_, .x))) |>
select(-c(NO3, H2S))
res_umap2 =
rnaseq_umap_tbl |>
ungroup() |>
select(-mag_name) |>
pivot_longer(
2:last_col(),
names_to = 'sample',
values_to = 'count') |>
pivot_wider(
names_from = contig,
values_from = count) |>
column_to_rownames('sample') |>
select(-where(~ sum(.x) == 0)) |>
as.matrix() |>
umap()
rna_hdbscan_res = hdbscan(res_umap2$layout, minPts = 5)
rna_hdbscan_res$cluster
## [1] 4 0 4 4 4 4 4 4 4 4 0 3 0 3 1 1 1 1 1 1 0 2 3 3 0 0 0 3 4 0 4 4 4 4 4 0 3 0
## [39] 3 3 4 0 2 2 2 2 0 3
umap_df2 <- res_umap2$layout |>
as.data.frame() |>
mutate(cluster = rna_hdbscan_res$cluster) |>
rename(UMAP1="V1",
UMAP2="V2") |>
rownames_to_column('sample') |>
as_tibble() |>
left_join(cariaco_env_vars, by = c('sample' = 'sample_name_used_for_this_study')) |>
filter(!is.na(depth))
# Metagenome UMAP analysis ----------------------------------------------------
dnaseq_umap_tbl = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/metaG_processed_mapping_tbl.rds')
dnaseq_umap_tbl = dnaseq_umap_tbl |> #rnaseq_mapping_counts |>
filter(gene_name %in% bgc_genes_10kb$gene_name) |>
mutate(contig = str_replace(gene_name, '(MAG_\\d+)-ctg(.*)_\\d+', '\\1_\\2'),
mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) |>
relocate(c(mag_name, contig), .before = 1) |>
select(-gene_name) |>
group_by(mag_name, contig) |>
summarize(across(everything(), ~ mean(.x))) |>
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983'))
## `summarise()` has grouped output by 'mag_name'. You can override using the
## `.groups` argument.
dnaseq_res_umap =
dnaseq_umap_tbl |>
ungroup() |>
select(-mag_name) |>
#column_to_rownames('contig') |>
pivot_longer(
2:last_col(),
names_to = 'sample',
values_to = 'count') |>
pivot_wider(
names_from = contig,
values_from = count) |>
left_join(col_metadata, by = c('sample' = 'column')) |>
mutate(depth = str_replace(sample, '.*(\\d{3}).*', '\\1'),
replicate = if_else(str_detect(sample, 'a'), 1, 2),
new_colname = paste0(season, fraction, depth, '_', replicate)) |>
select(-c(depth, replicate, season, fraction, sample, layer)) |>
relocate(new_colname) |>
column_to_rownames('new_colname') |>
select(-where(~ sum(.x) == 0)) |>
as.matrix() |>
umap()
hdbscan_res_dna = hdbscan(dnaseq_res_umap$layout, minPts = 5)
hdbscan_res_dna$cluster
## [1] 4 4 0 1 2 1 2 0 2 3 4 3 4 4 1 2 1 2 0 2 3 4 3 4 4 1 1 2 1 2 1 4 0 4 4 4 4 1
## [39] 1 2 1 2 1 0 3 4 0
dnaseq_umap_df <- dnaseq_res_umap$layout |>
as.data.frame() |>
mutate(cluster = hdbscan_res_dna$cluster) |>
rename(UMAP1="V1",
UMAP2="V2") |>
rownames_to_column('sample') |>
as_tibble() |>
left_join(cariaco_env_vars, by = c('sample' = 'sample_name_used_for_this_study')) |>
filter(!is.na(depth))
# create final joint umap plot - plotting code pasted here again ----------
#metaT plot
metaT_umap_result = umap_df2 |>
mutate(
size_fraction = if_else(
size_fraction == 'free-living (0.2-2.7 um)',
'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),
redox_regime =
str_to_title(redox_regime) |>
paste0(' - ', size_fraction)
) |>
ggplot(aes(x = UMAP1,
y = UMAP2,
color = time_point,
shape = redox_regime
)) +
geom_point(size = 4.5) +
labs(x = "\nUMAP1",
y = "UMAP2\n",
color = 'Time point',
shape = 'Redox regime/size fraction') +
theme_bw() +
scale_shape_manual(
values = c(0,15,1,16,2,17)) +
scale_color_manual(values = c(
'May' = '#009E73',
'November' = '#CC79A7')
) +
theme(
axis.title = element_text(size = 13),
axis.text = element_text(size = 12),
legend.text = element_text(size = 13),
legend.title = element_text(size = 13),
panel.grid = element_blank(),
legend.position = 'top') +
guides(shape = 'none')
# metaG plot
metaG_umap_result = dnaseq_umap_df |>
mutate(
size_fraction = if_else(
size_fraction == 'free-living (0.2-2.7 um)',
'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),
redox_regime =
str_to_title(redox_regime) |>
paste0(' - ', size_fraction)
) |>
ggplot(aes(x = UMAP1,
y = UMAP2,
color = time_point,
shape = redox_regime
)) +
geom_point(size = 4.5) +
labs(x = "\nUMAP1",
y = "UMAP2\n",
color = 'Time point',
shape = 'Redox regime/size fraction'
) +
theme_bw() +
scale_shape_manual(
values = c(0,15,1,16,2,17)) +
scale_color_manual(values = c(
'May' = '#E69F00',
'November' = '#56B4E9')
) +
theme(
axis.title = element_text(size = 13),
axis.text = element_text(size = 12),
legend.text = element_text(size = 13),
legend.title = element_text(size = 13),
panel.grid = element_blank(),
legend.position = 'top') +
guides(shape = 'none')
# just the metaT shape legend
metaT_shape_legend = umap_df2 |>
mutate(
size_fraction = if_else(
size_fraction == 'free-living (0.2-2.7 um)',
'Free-living (0.2-2.7 um)', 'Particle-associated (>2.7 um)'),
redox_regime =
str_to_title(redox_regime) |>
paste0(' - ', size_fraction)
) |>
ggplot(aes(x = UMAP1,
y = UMAP2,
color = time_point,
shape = redox_regime
)) +
geom_point(size = 4.5) +
labs(x = "\nUMAP1",
y = "UMAP2\n",
color = 'Time point',
shape = 'Redox regime/size fraction') +
theme_bw() +
scale_shape_manual(
values = c(0,15,1,16,2,17)) +
scale_color_manual(values = c(
'May' = '#009E73',
'November' = '#CC79A7')
) +
theme(
axis.title = element_text(size = 13),
axis.text = element_text(size = 12),
legend.text = element_text(size = 13),
legend.title = element_text(size = 13),
panel.grid = element_blank()) +
guides(color = 'none')
metaT_shape_legend = get_legend(metaT_shape_legend)
umap_final_joint_plot = plot_grid(
metaG_umap_result, NULL, metaT_umap_result, metaT_shape_legend,
nrow = 1,
rel_widths = c(1, 0.05, 1, 1),
labels = c('a', NA, 'b')
)
umap_final_joint_plot
## Warning: Removed 1 rows containing missing values (`geom_text()`).
Figure 4. Distribution of core and additional biosynthetic genes or domains and transcripts from biosynthetic gene clusters
final_domain_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/final_domain_summary.rds')
smc_palette = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/smc_palette.rds')
final_de_and_clust_data = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/final_de_and_clust_data.rds')
# create plot showing the most frequently occurring core/auxiliary biosynthetic genes in the Cariao MAGs
all_common_domains_plot =
final_domain_summary %>%
group_by(description) %>%
add_tally() %>%
ungroup() %>%
filter(!is.na(product)) %>%
mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>%
mutate(description = case_when(
str_detect(description,
regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
str_detect(description, regex(
'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
'FAD/FMN-dependent oxidoreductase',
str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',
TRUE ~ description)) %>% #view()
filter(n > 0.2 * max(n) |
description == 'FAD/FMN-dependent oxidoreductase'|
description == 'FAD/FMN binding domain') %>% #pull(description) %>% unique()
mutate(group = 'Core and additional biosynthetic genes') %>% #pull(description) %>% tabyl() %>% view()
select(c(description, product, group)) %>%
group_by(description, product, group) %>%
mutate(number = n()) %>%
ungroup() %>%
group_by(description) %>%
add_tally() %>%
ungroup() %>%
distinct() %>%
arrange(desc(n)) %>%
filter(description != 'ABC Transporter') %>%
ggplot(aes(
y = fct_inorder(description),
x = number,
fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
facet_wrap(~ group) +
theme_bw() +
theme(
strip.text = element_text(size = 11),
axis.text = element_text(size = 9),
axis.title.y = element_blank(),
strip.background = element_rect(fill = "wheat1"),
panel.grid = element_blank(),
axis.ticks.y = element_blank(),
legend.position = c(0.9875, 0.98),
legend.justification = c(1, 0.99),
legend.background = element_blank(),
panel.border = element_rect(color = 'black', fill = NA),
legend.box.background = element_rect(color = 'black'),
legend.title = element_text(size = 11),
legend.text = element_text(size = 10)
#legend.position = 'none'
) +
scale_fill_manual(values = c(smc_palette), na.value = 'white') +
labs(
fill = 'Secondary metabolite class',
y = 'Antibiotic resistance type',
x = '\nTotal genes\n'
) +
guides(fill = guide_legend(ncol = 2))
# # create plot showing the most frequently occurring core/auxiliary biosynthetic transcripts in the PA metatranscriptomes
pa_domains_plot =
final_de_and_clust_data %>%
filter(gene_name %in% final_domain_summary$gene_name) %>%
filter(is_sig_pa_biosynthetic | (pa_counts > 0 & fl_counts == 0)) %>%
rename(raw_product = product) %>%
left_join(final_domain_summary %>%
select(c(mag_name, contig, gene_name, product, description)),
by = c('contig', 'mag_name', 'gene_name')) %>%
mutate(product = case_when(
str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
product == 'Other' ~ 'Other cluster*',
product == 'PKS' ~ 'Polyketide',
TRUE ~ product)) %>%
select(-n) %>%
filter(!is.na(product)) %>%
mutate(description = case_when(
str_detect(description,
regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
str_detect(description, regex(
'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
'FAD/FMN-dependent oxidoreductase',
str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',
TRUE ~ description)) %>%
filter(description %in% all_common_domains_plot$data$description) %>%
mutate(group = 'Particle-associated') %>% #pull(description) %>% tabyl() %>% view()
select(c(description, product, group)) %>%
group_by(description, product, group) %>%
mutate(number = n()) %>%
ungroup() %>%
group_by(description) %>%
add_tally() %>%
ungroup() %>%
distinct() %>%
arrange(desc(n)) %>%
filter(description != 'ABC Transporter') %>%
ggplot(aes(
y = factor(description,
levels = unique(all_common_domains_plot$data$description)),
x = number,
fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
facet_wrap(~ group) +
theme_bw() +
theme(
strip.text = element_text(size = 11),
axis.text.x = element_text(size = 9),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
strip.background = element_rect(fill = "wheat1"),
panel.grid = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none') +
scale_fill_manual(values = c(smc_palette), na.value = 'white') +
labs(
fill = 'Secondary metabolite class',
y = 'Antibiotic resistance type',
x = '\nTotal transcripts') +
scale_x_continuous(limits = c(0, 200))
# # create plot showing the most frequently occurring core/auxiliary biosynthetic transcripts in the FL metatranscriptomes
fl_domains_plot =
final_de_and_clust_data %>%
filter(gene_name %in% final_domain_summary$gene_name) %>%
filter(is_sig_fl_biosynthetic | (fl_counts > 0 & pa_counts == 0)) %>%
rename(raw_product = product) %>%
left_join(final_domain_summary %>%
select(c(mag_name, contig, gene_name, product, description)),
by = c('contig', 'mag_name', 'gene_name')) %>%
mutate(product = case_when(
str_detect(product, regex('hybrid', ignore_case = FALSE)) ~ 'Hybrid cluster',
product == 'Other' ~ 'Other cluster*',
product == 'PKS' ~ 'Polyketide',
TRUE ~ product)) %>%
select(-n) %>%
filter(!is.na(product)) %>%
mutate(description = case_when(
str_detect(description,
regex('Histidine kinase-', ignore_case = TRUE)) ~ 'ATPase',
str_detect(description, regex('ketoacyl synthase', ignore_case = TRUE)) ~ 'Beta-ketoacyl synthase domain',
str_detect(description, 'RRE') ~ 'RiPP Recognition Element protein',
str_detect(description, 'Coenzyme PQQ synthesis') ~ 'PqqD-like domain',
str_detect(description, 'Radical SAM') ~ 'Radical SAM domain',
str_detect(description, 'Glycosyl transferase') ~ 'Glycosyl transferase',
str_detect(description, 'AMP-binding') ~ 'AMP-binding domain',
str_detect(description, 'Squalene') ~ 'Squalene-hopene cyclase domain',
str_detect(description, 'Acyl-CoA dehydrogenase') ~ 'Acyl-CoA dehydrogenase domain',
str_detect(description, regex(
'FAD dependent oxidoreductase|FMN-dependent oxidoreductase', ignore_case = TRUE)) ~
'FAD/FMN-dependent oxidoreductase',
str_detect(description, 'FAD binding|FAD-binding|FMN binding') ~ 'FAD/FMN binding domain',
TRUE ~ description)) %>%
filter(description %in% all_common_domains_plot$data$description) %>%
mutate(group = 'Free-living') %>% #pull(description) %>% tabyl() %>% view()
select(c(description, product, group)) %>%
group_by(description, product, group) %>%
mutate(number = n()) %>%
ungroup() %>%
group_by(description) %>%
add_tally() %>%
ungroup() %>%
distinct() %>%
arrange(desc(n)) %>%
filter(description != 'ABC Transporter') %>%
ggplot(aes(
y = factor(description,
levels = unique(all_common_domains_plot$data$description)),
x = number,
fill = fct_rev(fct_infreq(factor(product, exclude = NULL))))) +
geom_bar(stat = 'identity', position = 'stack') +
theme_bw() +
facet_wrap(~ group) +
theme_bw() +
theme(
strip.text = element_text(size = 11),
axis.text = element_text(size = 9),
axis.title.y = element_blank(),
strip.background = element_rect(fill = "wheat1"),
panel.grid = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none') +
scale_fill_manual(values = c(smc_palette), na.value = 'white') +
labs(
fill = 'Secondary metabolite class',
y = 'Antibiotic resistance type',
x = '\nTotal transcripts') +
scale_x_reverse(limits = c(200, 0))
# combine the PA and FL transcript expression bar plots
final_domains_plotgrid = plot_grid(
fl_domains_plot,
pa_domains_plot,
nrow = 1,
rel_widths = c(0.65, 0.35)
)
# combine the expression barplots with the genomic potential bar plot to create the final plot
FINAL_ALL_DOMAINS_PLOT =
plot_grid(all_common_domains_plot,
final_domains_plotgrid,
ncol = 1,
labels = 'auto',
rel_heights = c(0.51, 0.49))
FINAL_ALL_DOMAINS_PLOT
≤5% contamination) by bacterial (a) and archaeal phylum (b)
# Distribution of MAG phyla recovered and average genome completeness per phylum --------------------------------
sm_key = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/sm_key.rds')
mag_table = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/mag_table.rds')
heat_colors <- c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c('#ABD9E9',
'#FEE090',
'orange',
'#D73027'))(8000))
sm_summary <- sm_key %>%
select(mag_name, cluster, sm_class) %>%
distinct() %>%
left_join(mag_table, by = 'mag_name')
# plot used
mag_dist_plot_archaea <- mag_table %>%
group_by(phylum) %>%
summarize(freq = n()) %>%
arrange(desc(freq)) %>%
mutate(x = row_number(),
phylum) %>%
left_join(mag_table %>%
select(phylum, domain, completeness) %>%
group_by(phylum) %>%
add_tally(mean(completeness), name = 'average_completeness') %>%
select(-completeness) %>%
ungroup() %>%
distinct(),
by = 'phylum') %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum)) %>%
mutate(phylum = as_factor(phylum)) %>%
filter(domain == 'Archaea') %>%
ggplot(aes(phylum, freq)) +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(angle = 65, hjust = 1, size = 8),
axis.title.x = element_text(size = 10),
axis.title.y = element_blank(),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = 'wheat1'),
axis.text.y = element_text(size = 8),
legend.position = 'none'
) +
geom_segment(aes(xend = phylum, yend = 0), color = 'grey50') +
geom_point(size = 1.5,
aes(color = average_completeness)) +
geom_text(aes(label = freq, vjust = -0.65), size = 2.5) +
facet_grid(~ domain, scales = 'free', space = 'free') +
labs(x = 'Phylum',
y = 'Number of MAGs per phylum\n',
color = 'Average genome
completeness per phylum',
) +
scale_color_gradientn(colors = heat_colors,
breaks = seq(75, 100, by = 5),
limits = c(79, 100)
) +
scale_y_continuous(limits = c(0, 75),
expand = expansion(add = c(1, 0)))
mag_dist_plot_bacteria =
mag_table %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>%
group_by(phylum) %>%
summarize(freq = n()) %>%
arrange(desc(freq)) %>%
mutate(x = row_number(),
phylum) %>%
left_join(mag_table %>%
select(phylum, domain, completeness) %>%
group_by(phylum) %>%
add_tally(mean(completeness), name = 'average_completeness') %>%
select(-completeness) %>%
ungroup() %>%
distinct(),
by = 'phylum') %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum)) %>%
mutate(phylum = as_factor(phylum)) %>%
filter(domain == 'Bacteria') %>%
ggplot(aes(phylum, freq)) +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(angle = 65, hjust = 1, size = 8),
axis.title = element_text(size = 10),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = 'wheat1'),
legend.title = element_text(size = 8),
legend.direction = 'horizontal',
legend.margin = margin(c(4,15,4,4)),
legend.text = element_text(size = 8),
axis.text.y = element_text(size = 8),
legend.position = c(0.71, 0.87),
legend.background = element_rect(fill = "white",
color = "black")
) +
geom_segment(aes(xend = phylum, yend = 0), color = 'grey50') +
geom_point(size = 1.5,
aes(color = average_completeness)) +
geom_text(aes(label = freq, vjust = -0.65), size = 2.5) +
facet_grid(~ domain, scales = 'free', space = 'free') +
labs(x = 'Phylum',
y = 'Number of MAGs per phylum\n',
color = 'Average genome
completeness per phylum',
) +
scale_color_gradientn(colors = heat_colors,
breaks = seq(75, 100, by = 5),
limits = c(79, 100)
) +
scale_y_continuous(limits = c(0, 75),
expand = expansion(add = c(1, 0)))
mag_dist_plot = plot_grid(mag_dist_plot_bacteria, mag_dist_plot_archaea,
labels = c('a', 'b'),
rel_widths = c(4, 1),
nrow = 1,
label_size = 11.5,
align = 'hv',
label_x = c(0.045, 0.205))
mag_dist_plot
Process the CoverM relative abundance output files, plot relative abundance of the most abundant MAGs
mag_table = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/mag_table.rds')
final_genome_rel_abun_tbl = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/rel_abun_tibble_48_samples.rds')
# create sample names key
metaG_sampleNames_key.X = tibble(
sample_name = colnames(final_genome_rel_abun_tbl)[2:ncol(final_genome_rel_abun_tbl)],
depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
fraction = if_else(str_detect(sample_name, 'A'), 'PA', 'FL'),
season = case_when(
str_detect(sample_name, 'AM|BM') ~ 'M',
str_detect(sample_name, 'AN|BN') ~ 'N',
depth %in% c(103,198,234,295,314) ~ 'M',
depth %in% c(143,200,237,247,267) ~ 'N'),
replicate = if_else(str_detect(sample_name, 'a'), 1, 2),
new_sample_name = paste0(season, fraction, depth, '_', replicate)
)
all(colnames(final_genome_rel_abun_tbl)[2:ncol(final_genome_rel_abun_tbl)] == metaG_sampleNames_key.X$sample_name)
## [1] TRUE
#TRUE
# color palette listing the color choices for all annotation rows/columns on the heatmap
relPalette <- list(Layer = c(
'Oxycline' = 'darkslategray3',
'Shallow anoxic' = 'gold',
'Euxinic' = 'sienna3'),
Fraction = c('PA' = 'darkgreen',
'FL' = 'darkseagreen2'),
Season = c('May' = 'slategray',
'November' = 'slategray1'),
`Depth (m)` = c(
'103' = 'grey90',
'143' = 'grey85',
'198' = 'grey80',
'200' = 'grey75',
'234' = 'grey70',
'237' = 'grey65',
'247' = 'grey60',
'267' = 'grey55',
'295' = 'grey50',
'314' = 'grey45',
'900' = 'grey20'
),
`Phylum/Class` = c('Planctomycetota' = 'wheat1',
'Desulfobacterota' = 'tomato',
'Myxococcota' = 'orange',
'Alphaproteobacteria' = 'orchid1',
'Gammaproteobacteria' = 'thistle1',
'Chloroflexota' = 'slateblue1',
'Marinisomatota' = 'aquamarine',
'Aenigmarchaeota' = 'gold',
'Krumholzibacteriota' = 'darkred',
'Omnitrophota' = 'darkviolet',
'Verrucomicrobiota' = 'khaki',
'Thermoplasmatota' = 'firebrick1',
'AABM5-125-24' = 'gainsboro',
'Bacteroidota' = 'springgreen',
'SAR324' = 'chocolate4',
'Cloacimonadota' = 'grey28',
'Crenarchaeota' = 'yellowgreen',
'Acidobacteriota' = 'deepskyblue',
'Actinobacteriota' = '#666666',
'Eisenbacteria' = '#B89B74',
'Gemmatimonadota' = 'lightsteelblue1',
'Altiarchaeota' = 'ivory',
'Nitrospinota' = '#2A7FB7',
'Fermentibacterota' = '#1B9E77',
'Latescibacterota' = '#AD4C9E',
'Patescibacteria' = 'dodgerblue4',
'Undinarchaeota' = 'turquoise',
'Unclassified' = '#5A8950'
))
mag_table = mag_table %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum))
final_genome_rel_abun_tbl =
final_genome_rel_abun_tbl %>%
rename(mag_name = Genome) %>%
filter(!mag_name %in% c('unmapped', paste0('CarAnox_mtb2_', c(1507, 983, 769)))) %>%
mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2', 'MAG')) %>%
left_join(
mag_table %>%
select(mag_name, tax_num),
by = 'mag_name'
) %>%
relocate(tax_num, .after = 1) %>%
arrange(tax_num) %>%
select(-mag_name) %>%
column_to_rownames(var = 'tax_num') %>%
mutate(across(1:last_col(), ~ log1p(.x)), # log-transform relative abundance percentages
row_sums = rowSums(.[1:ncol(.)])) %>%
filter(row_sums > 0.005 * max(row_sums)) %>% # keep only the most abundant MAGs across the samples
select(-row_sums) %>%
rename_with(
~ metaG_sampleNames_key.X$new_sample_name,
.cols = 1:last_col()) %>%
relocate(
metaG_sampleNames_key.X %>%
arrange(desc(fraction), depth) %>%
pull(new_sample_name),
.after = 1)
# create character vector of phylums of the most abundant MAGs, sorted by the most to the least frequently appearing phylum
# split Proteobacteria in Alpha- and Gammaproteobacteria classes
phylum_order =
final_genome_rel_abun_tbl %>%
mutate(row_sums = rowSums(.),
tax_num = rownames(.)) %>%
left_join(
mag_table %>%
select(tax_num, phylum, class),
by = 'tax_num') %>%
select(c(row_sums, tax_num, phylum, class)) %>%
mutate(phylum = if_else(phylum == 'Proteobacteria', class, phylum)) %>%
select(-class) %>%
group_by(phylum) %>%
mutate(group_sum = sum(row_sums)) %>%
ungroup() %>%
arrange(desc(group_sum), phylum) %>%
pull(phylum) %>%
unique()
#change UAP2
phylum_order[28] = 'Undinarchaeota'
# create final relative abundance matrix to be used for plotting
final_genome_rel_abun_tbl =
final_genome_rel_abun_tbl %>%
mutate(row_sums = rowSums(.),
tax_num = rownames(.)) %>%
left_join(
mag_table %>%
select(tax_num, phylum, class),
by = 'tax_num') %>%
mutate(phylum = if_else(
phylum == 'Proteobacteria', class, phylum)) %>%
select(-class) %>%
group_by(phylum) %>%
mutate(group_sum = sum(row_sums)) %>%
ungroup() %>%
arrange(desc(group_sum), phylum) %>%
select(-c(row_sums, group_sum, phylum)) %>%
column_to_rownames(var = 'tax_num') %>%
relocate( # function rearrange the columns by season, and water layer - incrementally increasing in depth
colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = sort(c('NPA143_2', x[str_detect(x, 'NPA')])),
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_',
seq(1, 6, by = 1)),
each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(pa, fl)
g = str_split(g, pattern = ';') %>% unlist()
g = g[g != 'NPA143_2']
return(g)}}()
) %>%
as.matrix()
# row metadata for relative abundance plot - the phylum/class of each MAG
row_anno =
final_genome_rel_abun_tbl %>%
as.data.frame() %>%
rownames_to_column(var = 'tax_num') %>%
select(tax_num) %>%
left_join(
mag_table %>%
select(tax_num, phylum, class),
by = 'tax_num') %>%
as_tibble() %>%
mutate(phylum = if_else(
phylum == 'Proteobacteria', class, phylum)) %>%
select(-class) %>%
mutate(phylum = if_else(phylum == 'UAP2', 'Undinarchaeota', phylum)) %>%
rename(`Phylum/Class` = phylum) %>%
arrange(factor(`Phylum/Class`, levels = phylum_order)) %>%
column_to_rownames(var = 'tax_num')
# sort the Phylum/Class color palette by frequency
relPalette$`Phylum/Class` = relPalette$`Phylum/Class`[unique(row_anno$`Phylum/Class`)]
# column metadata for the relative abundance heatmap; annotation codings for sample fraction, sampling season, water layer, and water depth
col_anno = tibble(
cols = colnames(final_genome_rel_abun_tbl),
Fraction = str_replace(cols, '[A-Z]{1}([A-Z]{2})\\d{3}.*', '\\1'),
Season = if_else(str_detect(cols, 'M'), 'May', 'November'),
Layer = str_replace(cols, '[A-Z]+(\\d+)_.*', '\\1')
) %>%
mutate(Layer = as.integer(Layer),
Layer = case_when(
Layer < 247 ~ 'Oxycline',
Layer >= 247 & Layer < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(`Depth (m)` = str_replace(
cols, '.*(\\d{3}).*', '\\1'
)) %>%
column_to_rownames(var = 'cols') %>%
relocate(`Depth (m)`, Fraction, Season, Layer)
# create the heatmap
final_genome_rel_abun_plot = final_genome_rel_abun_tbl %>%
pheatmap::pheatmap(
name = 'Relative abundance (% total reads)',
angle_col = '315',
fontsize = 8,
annotation_row = row_anno,
annotation_col = col_anno,
annotation_colors = relPalette,
show_colnames = FALSE,
gaps_col = 23,
clustering_method = 'ward.D',
cluster_rows = FALSE,
cluster_cols = FALSE,
show_rownames = FALSE,
treeheight_row = 0,
treeheight_col = 0,
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
'#ebf4f7',
'#aae6fa',
'#f7e96d',
'#f7e43b',
'orange',
'#ff6a00',
'firebrick1',
'darkorchid3',
'purple4'))(3000)),
legend_breaks = seq(0, 3.5, by = 0.5),
legend_labels = ComplexHeatmap::gt_render(c(
'0% ',
'0.64% ',
'1.72% ',
'3.48% ',
'6.39% ',
'11.18% ',
'19.09% ',
'32.12% '))
)
final_genome_rel_abun_plot
Load required libraries, create matrix of counts. Each column will represent a metagenomic sample and rows will correspond to metagenome-assembled genomes (MAGs). Each cell will contain the number of reads that mapped to the contigs of a MAG.
library(furrr) # library of purrr-style iterators that utilize multiple cores for parallel processing
library(DESeq2) # library for running differential abundance analysis
library(BiocParallel) # for DESeq2 parallel processing
# if using a multicore machine, example furrr parallel processing parameters are below:
plan(multicore, workers = 35) # 35 cores allotted to furrr's multicore iterators
options(future.globals.maxSize = 5242880000) #5000 * 1024^2; 5Gb memory allotted to each core
# set directory to where the coverM files are located
setwd('~/path/to/github_data/coverM/dna/final_counts_of_reads_mapped_to_genomes/')
files = system('ls *.tsv', intern = TRUE)
sample_names = str_replace(files, '_counts_reads_mapped_per_genome.tsv', '')
# Create a list of read-mapping results (in tibble format); mapped reads to the MAGs (all contigs)
# Note: furrr's iterators are implemented like purrr's with the added 'future_' prepended to the function name; it will call the iterations in parallel across multiple cores
mapped_read_counts = future_map2(
files,
sample_names, ~
vroom::vroom(
file = .x,
delim = '\t',
col_names = c('mag_name', .y),
skip = 1,
show_col_types = FALSE
)
)
# Join all read-mapping results together into a tibble by the column of MAG names in each list component
mapped_read_counts =
reduce(mapped_read_counts, left_join, by = 'mag_name')
Load additional data related to the differential abundance analysis, prep the DESeq2 object and run DESeq2
# Load a tibble with taxonomic and assembly information for each MAG
#mag_table <- readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/coverM/deseq2_dnaseq_rdata/draftMag-taxonomic-table-cariaco.RDA')
mag_table = read_csv('rdata-files/mags_75compl_5cont.csv', show_col_types = FALSE)
taxonomic_ranks = c('domain', 'phylum', 'class', 'order', 'family', 'genus', 'species')
mag_table = mag_table |>
rename(mag_name = bin_id) |>
mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2-', 'MAG_')) |>
select(mag_name, classification, completeness, contamination, strain_het) |>
separate(classification, into = taxonomic_ranks, sep = ';') |>
mutate(across(all_of(taxonomic_ranks), ~ str_replace(.x, '[:alpha:]__', ''))) |>
mutate(across(all_of(taxonomic_ranks), ~ if_else(.x == '', NA_character_, .x))) |>
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum),
tax_num = str_replace(mag_name, 'MAG', phylum), .after = mag_name)
# Create metadata tibble for the metagenomic samples
sampleNames_key <- tibble(
sample_name = colnames(mapped_read_counts)[2:ncol(mapped_read_counts)]) %>%
mutate(depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
season = case_when(
(depth == 103 | depth == 198 | depth == 234 | depth == 295 | depth == 314) ~ 'M',
(depth == 143 | depth == 200 | depth == 237 | depth == 247 | depth == 267) ~ 'N',
str_detect(sample_name, 'M') ~ 'M',
str_detect(sample_name, 'N') ~ 'N'),
fraction = if_else(str_detect(sample_name, 'A'), 'PA', 'FL'),
replicate = if_else(str_detect(sample_name, 'a'), 1, 2),
new_column_names = paste0(season, fraction, depth, '_', replicate)) %>%
arrange(factor(sample_name, levels = colnames(select(mapped_read_counts, -mag_name))))
all(sampleNames_key$sample_name == colnames(select(mapped_read_counts, -mag_name))) #TRUE
# Create sample metadata for use in the DESeq() function. This analysis will look for differential abundance patterns between fractions in three different chemically distinct water layers: the oxycline, shallow anoxic, and euxinic depths
coldata <- sampleNames_key %>%
select(depth, fraction, new_column_names) %>%
mutate(layer = case_when(depth == 900 ~ 'euxinic',
depth >= 103 & depth <= 237 ~ 'oxycline',
depth > 237 & depth < 900 ~ 'shallow.anoxic')) %>%
select(-depth) %>%
dplyr::rename(sample = new_column_names) %>%
unite(group, c(fraction, layer), sep = '_') %>%
relocate(sample) %>%
mutate(group = as_factor(group))
# Rename the mapped-read counts tibble with metagenomic sample naming scheme used in the paper
mapped_read_counts = mapped_read_counts %>%
rename_with(
~ sampleNames_key$new_column_names,
.cols = 2:last_col()) %>%
column_to_rownames(var = 'mag_name')
# Create the DESeq2 analysis object
da_dds <- DESeq2::DESeqDataSetFromMatrix(countData = mapped_read_counts,
colData = coldata,
design = ~ group)
# Run the DESeq2 statistical model on the data; assumes the data have a negative binomial distribution
da_final_dds <- DESeq(da_dds)
Extract the results from the oxycline, shallow anoxic, and euxinic depths that compare PA and FL fraction types
# pull Wald test results comparing the oxycline PA to FL fractions
oxycline <- results(da_final_dds, contrast = c('group',
'PA_oxycline', 'FL_oxycline'), alpha = 0.05)
# pull Wald test results comparing the euxinic PA to FL fractions
euxinic <- results(da_final_dds, contrast = c('group',
'PA_euxinic', 'FL_euxinic'), alpha = 0.05)
# pull Wald test results comparing the shallow anoxic PA to FL fractions
shallow_anoxic <- results(da_final_dds, contrast = c('group', 'PA_shallow.anoxic',
'FL_shallow.anoxic'), alpha = 0.05)
# create a comprehensive tibble including all differential abundance results
# classify MAG fraction "preference" based on the direction of log2FoldChange and the adjusted p-values
allDaResults <- list(oxycline = oxycline,
shallow_anoxic = shallow_anoxic,
euxinic = euxinic) %>%
imap(~ .x %>%
as.data.frame() %>%
rownames_to_column(var = 'mag_name') %>%
mutate(mag_name = str_replace(mag_name, 'CarAnox_mtb2', 'MAG')) %>%
as_tibble() %>%
mutate(preference = case_when(log2FoldChange > 0 & padj < 0.05 ~ 'particle enriched',
log2FoldChange < 0 & padj < 0.05 ~ 'free-living enriched',
padj > 0.05 | is.na(padj) ~ 'no significant difference'),
layer = .y) %>%
left_join(mag_table %>% select(mag_name, phylum), by = 'mag_name') %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum),
preference = factor(preference,
levels = c(
'particle enriched',
'free-living enriched',
'no significant difference'))))
# save the differential abundance final results tibble
saveRDS(allDaResults, file = '~/path/to/github_data/coverM/dna/rdata/deseq2_dnaseq_rdata/deseq2_results_by_water_layer.rds')
Plot the differential abundance results
library(tidyverse) # loads several useful data-wrangling and plotting libraries
library(gridtext) # for editing text in heatmap plots
library(ComplexHeatmap) # for heatmap plots
library(pheatmap) # for heatmap plots
library(cowplot) # for combining various plots together in grids
library(ggtext) # used for ggplots
library(glue) # used for ggplots
#locally load DESeq2 results
allDaResults = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/deseq2_results_by_water_layer.rds')
## plotting
magFractionPref <- allDaResults %>%
map_dfr(~
.x %>%
mutate(layer = case_when(
layer == 'oxycline' ~ 'Oxycline',
layer == 'euxinic' ~ 'Euxinic',
layer == 'shallow_anoxic' ~ 'Shallow anoxic',
))
)
magFractionPref_plot = magFractionPref %>%
ggplot(aes(y = fct_rev(fct_infreq(phylum)),
fill = fct_relevel(preference, c('no significant difference',
'free-living enriched',
'particle enriched')))) +
geom_bar() +
labs(x = '\nNumber of genomes', y = 'Phylum\n', fill = 'Fraction preference') +
theme_bw() +
scale_fill_manual(values = c('grey80', 'palegreen3', 'burlywood2')) +
facet_wrap(~ fct_relevel(layer, c('Oxycline', 'Shallow anoxic', 'Euxinic'))) +
theme(
axis.title = element_text(size = 10),
strip.text = element_text(size = 10),
strip.background = element_rect(fill = 'wheat1'),
panel.grid = element_blank(),
legend.position = 'top',
legend.justification = c(0.92,0)
)
magFractionPref_plot
bgc_summary = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/bgc_summary.rds')
bgc_10k_kb = bgc_summary %>%
mutate(size = end - start, .after = contig) %>%
filter(size >= 10000)
bgc_count_data = bgc_10k_kb %>%
mutate(mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1'), .before = contig) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% # remove MAGs that are from laboratory contamination
select(-mag_name) %>%
mutate(clust = paste0('clust_', 1:n()), .before = size) %>%
group_by(contig) %>%
group_split() %>%
map_dfr(~ .x %>%
mutate(ID.match = map2(
1:length(start), list(.), ~
.y %>%
filter(
start > start[.x] & start < end[.x] | # start of the BGC is greater than your start, and less than your end
start < start[.x] & end > start[.x] | # start of the BGC is less than your start, and end is greater than your start
start == start[.x] & start < end[.x]) %>% # start of the BGC is equal to your start, and less than your end
pull(clust)),
.before = start)) %>%
group_by(contig) %>%
mutate(concat = list(unlist(ID.match)), .after = ID.match) %>%
mutate(concat = map(concat, ~ unique(.x[duplicated(.x)]))) %>%
mutate(length_concat = length(concat), .after = concat) %>%
mutate(n = n(), .after = length_concat) %>%
ungroup()
all(bgc_count_data$length_concat == bgc_count_data$n) # TRUE; all contigs w/ multiple BGCs have some level of overlap amongst them; can classify these all as 'hybrid clusters' of various sorts
## [1] TRUE
bgc_count_data = bgc_count_data %>%
select(-c(ID.match, concat, length_concat)) %>%
group_by(contig) %>%
mutate(is_hybrid = if_else(n() > 1, TRUE, FALSE), .after = contig)
bgc_count_data_summary = bgc_count_data %>%
ungroup() %>%
select(-detection_rule) %>%
mutate(product = str_replace(product, '-like', ''),
product = str_replace(product, '.*PKS', 'PKS')) %>%
arrange(contig, product) %>%
group_by(contig, is_hybrid) %>%
select(-c(filename, filepath)) %>%
summarize(across(everything(), ~ paste0(.x, collapse = ';')), .groups = 'drop') %>%
mutate(product = if_else(
str_detect(product, ';'), str_replace(product, '(.*)', '\\1 hybrid'), product),
product = str_replace_all(product, ';', '-')
)
bgc_count_data_summary = bgc_count_data_summary %>%
mutate(
product = case_when(
!is_hybrid & str_detect(product, 'lanthipeptide|RRE-containing|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides') ~ 'RiPP',
!is_hybrid & str_detect(product, 'lactone') ~ 'Lactone',
!is_hybrid & str_detect(product, 'acyl_amino_acids|nucleoside|PBDE|indole|siderophore|resorcinol|ectoine|NAPAA|CDPS|LAP|redox-cofactor') ~ 'other',
!is_hybrid & str_detect(product, 'hglE-KS') ~ 'PKS',
!is_hybrid & str_detect(product, 'arylpolyene') ~ 'Aryl polyene',
!is_hybrid & str_detect(product, 'redox-cofactor') ~ 'Redox-cofactor',
#
is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-PKS hybrid',
is_hybrid & str_detect(product, 'NRPS') & !str_detect(product, 'PKS|hglE-KS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'NRPS hybrid',
is_hybrid & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|NRPS|hglE-KS') ~ 'RiPP hybrid',
is_hybrid & str_detect(product, 'PKS|hglE-KS') & !str_detect(product, 'NRPS|lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') ~ 'PKS hybrid',
is_hybrid & str_detect(product, 'NRPS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'PKS|hglE-KS') ~ 'NRPS-RiPP hybrid',
is_hybrid & str_detect(product, 'PKS|hglE-KS') & str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP') & !str_detect(product, 'NRPS') ~ 'PKS-RiPP hybrid',
is_hybrid & str_detect(product, 'terpene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS') ~ 'Terpene hybrid',
is_hybrid & str_detect(product, 'ladderane') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Ladderane hybrid',
is_hybrid & str_detect(product, 'arylpolyene') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Aryl polyene hybrid',
is_hybrid & str_detect(product, 'lactone') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Lactone hybrid',
is_hybrid & str_detect(product, 'CDPS') & !str_detect(product, 'lanthipeptide|RRE|thiopeptide|proteusin|lassopeptide|sactipeptide|ranthipeptide|thioamitides|RiPP|PKS|NRPS|hglE-KS|terpene') ~ 'Other hybrid',
#
TRUE ~ product
)) %>%
mutate(product = if_else(str_detect(product, '^[:lower:]'), str_to_title(product), product))
bgc_count_data_summary = bgc_count_data_summary %>%
mutate(mag_name = str_replace(contig, '(MAG_\\d+)_\\d+', '\\1'), .before = contig) %>% #create mag name col
left_join(mag_table %>% select(mag_name, domain, phylum), by = 'mag_name') %>% # add phylum, domain cols
relocate(c(domain, phylum), .after = 1) %>% # relocate cols
select(c(phylum, domain, product)) %>% # select some cols
mutate(n_product_in_phylum = 1,
product = if_else(str_detect(product, 'hybrid'), 'Hybrid cluster', product)) %>% # edit product col, add sum col, sm class
group_by(phylum, domain, product) %>%
summarize(n_product_in_phylum = sum(n_product_in_phylum), .groups = 'drop') %>% # sum up each type of bgc in each phylum
left_join(mag_table %>%
select(phylum) %>%
group_by(phylum) %>%
summarize(n_mags_in_phylum = n()),
by = 'phylum') %>% # join col that says how many MAGs are in each phylum
mutate(norm_sm_freq = n_product_in_phylum / n_mags_in_phylum, # divide #bgc / # mag per phylum for each bgc per phylum
product = if_else(product == 'Other', 'Other cluster*', product)) %>%
#
group_by(phylum) %>%
add_tally(sum(norm_sm_freq), name = 'total_sm_freq') %>%
ungroup() %>%
arrange(desc(total_sm_freq), phylum) %>%
mutate(phylum = if_else(is.na(phylum), 'Unclassified', phylum))
# Pie chart of Cariaco MAG BGCs groupped by class -------------------------
final_pie =
bgc_count_data_summary %>%
mutate(product = if_else(product == 'PKS', 'Polyketide', product)) %>%
select(product, n_product_in_phylum) %>%
group_by(product) %>%
summarize(n_product_in_phylum = sum(n_product_in_phylum)) %>%
left_join(
tibble(
product = names(smc_palette),
color = unname(smc_palette)),
by = 'product')
library(plotly) # load plotly for plotting
##
## Attaching package: 'plotly'
## The following object is masked _by_ '.GlobalEnv':
##
## highlight
## The following object is masked from 'package:ComplexHeatmap':
##
## add_heatmap
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# create the pie chart and save it
final_bgc_pie = plot_ly(final_pie,
labels = ~product,
values = ~n_product_in_phylum,
type = 'pie',
textposition = 'inside',
textinfo = 'label+percent+value',
insidetextfont = list(color = '#FFFFFF'),
text = ~n_product_in_phylum,
marker = list(colors = ~color,
line = list(
color = '#FFFFFF',
width = 1)),
showlegend = FALSE) %>%
layout(xaxis = list(
showgrid = FALSE,
zeroline = FALSE,
showticklabels = FALSE),
yaxis = list(
showgrid = FALSE,
zeroline = FALSE,
showticklabels = FALSE))
final_bgc_pie
Process the .paf output files, filter poor alignments, save the results
# load required libraries
library(tidyverse)
library(furrr)
#set parallel processing params
plan(multicore, workers = 48)
options(future.globals.maxSize = 52428800000)
# metadata file of gene names
gene_metadata = tibble(
gene_name = readLines('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX/final_concat_gene_names.txt')
) %>%
separate(gene_name,
into = c('gene_name', 'start', 'end', 'strand', 'extra_data'),
sep = ' # ')
gene_metadata = gene_metadata %>%
mutate(gene_name = str_replace(gene_name, '^>', ''))
empty_gene_counts_filler = gene_metadata %>%
select(gene_name) %>%
mutate(n_reads_mapped = 0L)
rm(gene_metadata)
# set working directory to the one with the .paf output files
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used')
# create vector of .paf full file paths
file_paths = system('ls *.paf', intern = TRUE)
sample_names = str_replace(file_paths, '.paf', '')
#function to process the .paf files
prepare_read_mappings = function(.empty_gene_counts_filler, .file_path) {
# extract metaT sample name from .file_path
.sample_name = str_replace(.file_path, '.paf', '')
# read the .paf file into memory
mapping_results = vroom::vroom(
.file_path,
col_names = c(
c(
'qname', 'qlen', 'qstart', 'qend',
'strand', 'tname', 'tlen', 'tstart',
'tend', 'nmatch', 'alen', 'mapq',
'X13', 'X14', 'X15', 'X16', 'X17'
)
))
# select cols 1-12
mapping_results = select(mapping_results, 1:12)
# filter to keep only certain alignment results
mapping_results = mapping_results %>%
filter(
alen >= 0.5 * qlen, # the alignment length is at least 50% of the read length
nmatch >= 0.95 * alen # at least 95% of the aligned bases are a match
)
mapping_results = mapping_results %>%
group_by(tname) %>% # create groupped tibble; groupped by target sequences (genes)
summarize(n_reads_mapped = n()) # aggregate the tibble, summing up counts of mapped reads
# bind the counts of mapped reads to genes with all the other genes, counted zero times
mapping_results = mapping_results %>%
rename(gene_name = tname) %>%
bind_rows(
filter(empty_gene_counts_filler, !(gene_name %in% .$gene_name))
)
}
# process .paf files using parallel processing
processed_mapping_counts = future_map2(
list(empty_gene_counts_filler),
file_paths, ~
prepare_read_mappings(.x, .y)
)
# save the list of processed files
saveRDS(processed_mapping_counts, file = 'rdata/list_of_mapped_metaT_reads_from_each_sample.rds')
Finish processing the read-mapping results, calculate normalized Transcript Per Million (TPM) values for the final dataset
# create tibble that has all metatranscriptome read-mapping results joined together
processed_mapping_counts =
reduce(processed_mapping_counts, left_join, by = 'gene_name') %>%
rename_with( # rename the meteatranscriptome sample columns based on naming scheme used in the paper
~ sample_names,
.cols = 2:last_col()) %>%
mutate(gene_name = str_replace(
gene_name, '(MAG_\\d+)_(\\d+_\\d+)', '\\1-ctg\\2'))
#read in gene lengths data
gl = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/gene_lengths_tibble.rda')
gl = gl %>%
mutate(gene_length = gene_length / 1000) # convert unit length to kilobases
# join gene length data to read-mapping data
# divide each read-mapping count by the length of the gene that it mapped to
processed_mapping_counts = processed_mapping_counts %>%
left_join(gl, by = 'gene_name') %>%
relocate(gene_length, .after = 1) %>%
mutate(across(3:last_col(), ~ .x / gene_length))
# function to convert RNA-seq counts to TPM; TPM is normalized across and within samples
# the counts tibble has already had each cell divided by gene length
# this function will calculate the TPM scaling factor for each column and divide each cell in a column by the scaling factor, for all columns
get_tpm_for = function(col) {
scaling_factor = sum(col) / 1e6
col = col / scaling_factor
return(col)
}
# finalize the TPM normalization
processed_mapping_counts = processed_mapping_counts %>%
select(-gene_length) %>%
mutate(across(2:last_col(), ~ get_tpm_for(.x)))
# read in biosynthetic gene cluster (BGC) metadata, parsed from antiSMASH 6 output
bgc = vroom::vroom(
'/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/salmon_related_data_files/bgc_genes_from_10kb_clusters.txt',
show_col_types = FALSE)
bgc = bgc %>%
mutate(gene_name = str_replace(
gene_name,
'(^ctg\\d+_\\d+)-(MAG.*)', '\\2-\\1'
))
# filter the TPM-normalized read-mapping data to retain only biosynthetic gene cluster
# transcript mappings from biosynthetic clusters that were at least 10 kilobases in size
processed_mapping_counts = processed_mapping_counts %>%
filter(gene_name %in% bgc$gene_name)
Process the .paf output files, filter poor alignments, save the results
# load required libraries
library(tidyverse)
library(furrr)
# set working directory to the one with the .paf output files
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaG_mapping_final_results_used')
#set parallel processing params
plan(multicore, workers = 19)
#(125000 * 1024 ^2) %>% clipr::write_clip()
options(future.globals.maxSize = 1.31072e+11) #125000 * 1024 ^2
empty_gene_counts_filler = readRDS('rdata/metaG_empty_gene_counts_filler.rds')
# create vector of .paf full file paths
file_paths = system('ls *.paf', intern = TRUE)
sample_names = str_replace(file_paths, '.paf', '')
#function to process the .paf files
prepare_read_mappings = function(.empty_gene_counts_filler, .file_path) {
# extract metaT sample name from .file_path
.sample_name = str_replace(.file_path, '.paf', '')
# read the .paf file into memory
mapping_results = vroom::vroom(
.file_path,
col_names = c(
c(
'qname', 'qlen', 'qstart', 'qend',
'strand', 'tname', 'tlen', 'tstart',
'tend', 'nmatch', 'alen', 'mapq',
'X13', 'X14', 'X15', 'X16', 'X17'
)
))
# select cols 1-12
mapping_results = select(mapping_results, 1:12)
# filter to keep only certain alignment results
mapping_results = mapping_results |>
filter(
alen >= 0.5 * qlen, # the alignment length is at least 50% of the read length
nmatch >= 0.95 * alen # at least 95% of the aligned bases are a match
)
mapping_results = mapping_results |>
group_by(tname) |> # create groupped tibble; groupped by target sequences (genes)
summarize(n_reads_mapped = n()) # aggregate the tibble, summing up counts of mapped reads
# bind the counts of mapped reads to genes with all the other genes, counted zero times
mapping_results = mapping_results |>
rename(gene_name = tname) %>%
bind_rows(
filter(empty_gene_counts_filler, !(gene_name %in% .$gene_name))
)
}
# process .paf files using parallel processing
processed_mapping_counts = future_map2(
list(empty_gene_counts_filler),
file_paths, ~
prepare_read_mappings(.x, .y)
)
# save the list of processed files
saveRDS(processed_mapping_counts, file = 'rdata/list_of_mapped_metaG_reads_from_each_sample.rds')
Prepare gene expression matrix for DESeq2 differential expression analysis
# deseq2 ALL PA DEPTHS VS ALL FL DEPTHS run on minimap2 counts files -- 95% min percent identity of alignments at least 50% of read length aligned --------
# set working dir to the one containing .paf files from metatranscriptomic read-mapping to all cariaco MAG genes
setwd('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/')
files = system('ls *.paf', intern = TRUE)
sample_names = str_replace(files, '.paf', '')
# load list of tibbles - each tibble containing the read-mapping counts to genes from each metatranscriptomic sample
mapped_read_counts = readRDS('rdata/list_of_mapped_metaT_reads_from_each_sample.rds')
# join the tibbles together
mapped_read_counts =
reduce(mapped_read_counts, left_join, by = 'gene_name')
# rename samples to use sample names used in the paper
mapped_read_counts =
mapped_read_counts %>%
rename_with(
~ sample_names,
.cols = 2:last_col()
)
mapped_read_counts =
mapped_read_counts %>%
mutate(gene_name = str_replace(
gene_name, '(MAG_\\d+)_(\\d+_\\d+)', '\\1-ctg\\2')) %>%
as.data.frame() %>%
column_to_rownames(var = 'gene_name')
# create column metadata for use with the DESeq() function
col_data = tibble(sample = colnames(mapped_read_counts)) %>%
mutate(fraction = if_else(str_detect(sample, 'PA'), 'PA', 'FL'),
depth = str_replace(sample, '[:ALPHA:]{3}(\\d+)_\\d', '\\1'),
depth = as.integer(depth),
layer = case_when(depth == 900 ~ 'euxinic',
depth >= 103 & depth <= 237 ~ 'oxycline',
depth > 237 & depth < 900 ~ 'shallow.anoxic'),
group = paste0(fraction, '_', layer)) %>%
select(-c(fraction, depth, layer)) %>%
mutate(group = str_replace(group, '(.*)_.*', '\\1')) %>%
mutate(group = as_factor(group))
all(colnames(mapped_read_counts) == col_data$sample) # TRUE
# load a tibble containing the length in bp of each gene from all Cariaco MAGs
gl = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/gene_lengths_tibble.rda')
gl = gl %>%
mutate(gene_length = gene_length / 1000) # convert length unit to kilobases
logTransformed_mapping_tpm = mapped_read_counts %>%
rownames_to_column(var = 'gene_name') %>%
as_tibble() %>%
left_join(gl, by = 'gene_name') %>%
relocate(gene_length, .after = 1) %>%
mutate(across(3:last_col(), ~ .x / gene_length))
get_tpm_for = function(col) {
scaling_factor = sum(col) / 1e6
col = col / scaling_factor
return(col)
}
logTransformed_mapping_tpm = logTransformed_mapping_tpm %>%
select(-gene_length) %>%
mutate(across(2:last_col(), ~ get_tpm_for(.x))) %>%
mutate(across(2:last_col(), ~ log1p(.x)))
bgc = vroom::vroom(
'/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/salmon_related_data_files/bgc_genes_from_10kb_clusters.txt',
show_col_types = FALSE)
bgc = bgc %>%
mutate(gene_name = str_replace(
gene_name,
'(^ctg\\d+_\\d+)-(MAG.*)', '\\2-\\1'
))
logTransformed_mapping_tpm = logTransformed_mapping_tpm %>%
filter(gene_name %in% bgc$gene_name)
da_dds <- DESeq2::DESeqDataSetFromMatrix(mapped_read_counts,
colData = col_data,
design = ~ group)
register(MulticoreParam(40))
da_final_dds <- DESeq2::DESeq(da_dds, parallel = TRUE) # run the DESeq2 statistical model which assumes a negative binomial
# pull Wald test results comparing the oxycline PA to FL fractions
res <- DESeq2::results(da_final_dds,
contrast = c('group',
'PA',
'FL'),
alpha = 0.05)
final_res <- res %>%
as.data.frame() %>%
rownames_to_column(var = 'mag_name') %>%
as_tibble() %>%
mutate(enrichment = case_when(
log2FoldChange > 0 & padj < 0.05 ~ 'particle enriched',
log2FoldChange < 0 & padj < 0.05 ~ 'free-living enriched',
padj > 0.05 | is.na(padj) ~ 'no significant difference'))
col_metadata = tibble(sample = colnames(mapped_read_counts)[
2:ncol(mapped_read_counts)
]) %>%
mutate(fraction = if_else(str_detect(sample, 'PA'), 'PA', 'FL'),
depth = str_replace(sample, '[:ALPHA:]{3}(\\d+)_\\d', '\\1'),
depth = as.integer(depth),
layer = case_when(depth == 900 ~ 'euxinic',
depth >= 103 & depth <= 237 ~ 'oxycline',
depth > 237 & depth < 900 ~ 'shallow.anoxic'))
mapped_read_counts = mapped_read_counts %>%
rownames_to_column(var = 'gene_name') %>%
as_tibble()
cs = mapped_read_counts %>%
mutate(mag_name = str_replace(gene_name, '(MAG_\\d+)-.*', '\\1'), .before = 1)
cs = cs %>%
mutate(
fl_counts = rowSums(select(., col_metadata %>%
filter(fraction == 'FL') %>%
pull(sample))),
pa_counts = rowSums(select(., col_metadata %>%
filter(fraction == 'PA') %>%
pull(sample)))
) %>%
relocate(c(fl_counts, pa_counts), .after = gene_name) %>%
select(1:8)
#bgc = readRDS('/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/rdata/bgc_kb_5000_gene_data.rda')
bgc_uniq = bgc %>%
select(mag_name, gene_name) %>%
distinct() %>%
mutate(is_biosynthetic = TRUE)
sig_genes = res %>%
as.data.frame() %>%
rownames_to_column(var = 'gene_name') %>%
as_tibble() %>%
left_join(bgc_uniq %>%
select(-mag_name),
by = 'gene_name')
#pa_sig
sig_genes %>%
filter(is_biosynthetic, log2FoldChange > 0, padj <= 0.05) %>%
mutate(is_sig_pa_biosynthetic = TRUE) %>%
select(gene_name, is_sig_pa_biosynthetic)
#fl_sig
sig_genes %>%
filter(is_biosynthetic, log2FoldChange < 0, padj <= 0.05) %>%
mutate(is_sig_fl_biosynthetic = TRUE) %>%
select(gene_name, is_sig_fl_biosynthetic)
clust_analysis = pmap_df(list(
list(sig_genes),
list(sig_genes %>%
filter(is_biosynthetic, log2FoldChange > 0, padj <= 0.05) %>%
mutate(is_sig_pa_biosynthetic = TRUE) %>%
select(gene_name, is_sig_pa_biosynthetic)),
list(sig_genes %>%
filter(is_biosynthetic, log2FoldChange < 0, padj <= 0.05) %>%
mutate(is_sig_fl_biosynthetic = TRUE) %>%
select(gene_name, is_sig_fl_biosynthetic)),
list(
cs %>% select(2:4)
)
), ~
..1 %>%
left_join(..2, by = 'gene_name') %>%
left_join(..3, by = 'gene_name') %>%
left_join(..4, by = 'gene_name')
)
clust_data_pa = clust_analysis %>%
filter(is_sig_pa_biosynthetic |
(pa_counts > 0 & fl_counts == 0 & is_biosynthetic)
) %>%
relocate(c(fl_counts, pa_counts, padj,
is_sig_pa_biosynthetic,
is_sig_fl_biosynthetic,
is_biosynthetic), .after = gene_name) %>%
left_join(bgc, by = 'gene_name')
clust_data_fl = clust_analysis %>%
filter(is_sig_fl_biosynthetic |
(fl_counts > 0 & pa_counts == 0 & is_biosynthetic)
) %>%
relocate(c(fl_counts, pa_counts, padj,
is_sig_pa_biosynthetic,
is_sig_fl_biosynthetic,
is_biosynthetic), .after = gene_name) %>%
left_join(bgc, by = 'gene_name')
bgc_summary = bgc %>%
group_by(contig, region) %>%
summarize(n = n(), .groups = 'drop')
pa_expressed = clust_data_pa %>%
left_join(bgc_summary, by = c('contig', 'region')) %>%
group_by(contig, region) %>%
rename(cluster_size = n) %>%
add_tally() %>%
ungroup() %>%
mutate(proportion_expressed = n / cluster_size)
pa_expressed_summary = pa_expressed %>%
select(product, proportion_expressed,
contig, region, cluster_size, n) %>%
distinct()
fl_expressed = clust_data_fl %>%
left_join(bgc_summary, by = c('contig', 'region')) %>%
group_by(contig, region) %>%
rename(cluster_size = n) %>%
add_tally() %>%
ungroup() %>%
mutate(proportion_expressed = n / cluster_size)
fl_expressed_summary = fl_expressed %>%
select(product, proportion_expressed,
contig, region, cluster_size, n) %>%
distinct()
####
map_chr(ls(), ~ paste0(.x, ': ', object.size(get(.x)) %>% format(units = "Mb")))
final_de_and_clust_data = pa_expressed %>%
bind_rows(fl_expressed)
test_final_de = readRDS('~/Downloads/final_de_and_uniq_expr_clust_data.rds')
#PA
test_final_de %>%
filter(
(padj < 0.05 & log2FoldChange > 0) |
(pa_counts > 0 & fl_counts == 0)) %>%
left_join(
mag_table %>%
select(mag_name, phylum, class),
by = 'mag_name') %>%
pull(phylum) %>%
table() %>%
sort()
#FL
test_final_de %>%
filter(
(padj < 0.05 & log2FoldChange < 0) |
(fl_counts > 0 & pa_counts == 0)) %>%
left_join(
mag_table %>%
select(mag_name, phylum, class),
by = 'mag_name') %>%
pull(phylum) %>%
table() %>%
sort()
test_final_de %>%
filter(pa_counts > 0 & fl_counts == 0) %>%
nrow()
test_final_de %>%
filter(fl_counts > 0 & pa_counts == 0) %>%
nrow()
clust_gene_summary = readRDS('~/Documents/cariaco-tidy-analysis/cariaco_github_data_small/rdata/clust_gene_summary.rds')
allDaResults = allDaResults %>%
map_dfr(~ .x)
allDaResults = allDaResults %>%
group_by(mag_name) %>%
mutate(strict_fraction_preference = case_when(
all(preference == 'particle enriched') ~ 'PA',
all(preference == 'free-living enriched') ~ 'FL',
TRUE ~ 'no strict preference'), .after = 1) %>%
ungroup() %>%
arrange(phylum, mag_name)
strict_pa_mags = allDaResults %>%
filter(strict_fraction_preference == 'PA') %>%
pull(mag_name) %>%
unique()
strict_fl_mags = allDaResults %>%
filter(strict_fraction_preference == 'FL') %>%
pull(mag_name) %>%
unique()
# fl transporter summary tibble
fl_transporter_summary =
clust_gene_summary %>%
filter(mag_name %in% strict_fl_mags) %>%
left_join(mag_table %>%
select(mag_name, phylum),
by = 'mag_name') %>%
group_by(contig, product) %>%
mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
.after = product) %>%
mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
select(mag_name, phylum, contig, product, contains_transporter) %>%
mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
select(-contains_transporter) %>%
ungroup() %>%
rename(contains_transporter = temp) %>%
distinct()
# pa transporter summary tibble
pa_transporter_summary =
clust_gene_summary %>%
filter(mag_name %in% strict_pa_mags) %>%
left_join(mag_table %>%
select(mag_name, phylum),
by = 'mag_name') %>%
group_by(contig, product) %>%
mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
.after = product) %>%
mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
select(mag_name, phylum, contig, product, contains_transporter) %>%
mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
select(-contains_transporter) %>%
ungroup() %>%
rename(contains_transporter = temp) %>%
distinct()
###
pa_transporter_summary %>%
group_by(phylum) %>%
summarize(
has_transporter = sum(contains_transporter),
no_transporter = sum(!contains_transporter)
)
###
fl_transporter_summary %>%
group_by(phylum) %>%
summarize(
has_transporter = sum(contains_transporter),
no_transporter = sum(!contains_transporter)) %>%
print(n = 30)
## # A tibble: 24 × 3
## phylum has_transporter no_transporter
## <chr> <int> <int>
## 1 AABM5-125-24 0 4
## 2 Acidobacteriota 3 19
## 3 Aenigmarchaeota 1 0
## 4 Altiarchaeota 1 0
## 5 Bacteroidota 10 2
## 6 Campylobacterota 1 0
## 7 Chloroflexota 6 20
## 8 Cyanobacteria 2 0
## 9 Desulfobacterota 11 27
## 10 Gemmatimonadota 2 4
## 11 Iainarchaeota 0 1
## 12 Latescibacterota 8 14
## 13 Marinisomatota 5 2
## 14 Nanoarchaeota 10 13
## 15 Nitrospinota 1 5
## 16 Omnitrophota 50 58
## 17 Patescibacteria 1 0
## 18 Planctomycetota 9 13
## 19 Proteobacteria 52 49
## 20 SAR324 7 15
## 21 Spirochaetota 0 1
## 22 Thermoplasmatota 1 0
## 23 UAP2 0 1
## 24 UBA10199 6 4
#####
logTransformed_tpm_for_bgc_genes_minimap2_res = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')
# temp_FL_rowAnno = temp_FL_rowAnno %>%
# mutate(temp = as.numeric(contains_transporter))
bgc_genes_10kb =
bgc_genes_10kb %>%
mutate(gene_name = str_replace(gene_name, '(ctg.*_\\d+)-(MAG_\\d+)', '\\2-\\1'))
# tpm of each gene from clusters >= 10kb in size -- FL
temp_FL_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, contains('FL')) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(
bgc_genes_10kb %>%
left_join(fl_transporter_summary %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
arrange(contains_transporter) %>%
column_to_rownames(var = 'gene_name') %>%
select(contains_transporter) %>%
mutate(contains_transporter = as.numeric(contains_transporter))
temp_sum_FL_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, contains('FL')) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(
bgc_genes_10kb %>%
left_join(fl_transporter_summary %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
select(-row_sums) %>%
group_by(mag_name, contig, region, product, contains_transporter) %>%
summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
mutate(temp = paste0(contig, ';', product), .before = 1) %>%
arrange(contains_transporter) %>%
column_to_rownames(var = 'temp') %>%
select(contains_transporter) %>%
mutate(contains_transporter = as.numeric(contains_transporter))
temp_sum_FL_rowAnno
temp_sum_PA_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, contains('PA')) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(
bgc_genes_10kb %>%
left_join(pa_transporter_summary %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
select(-row_sums) %>%
group_by(mag_name, contig, region, product, contains_transporter) %>%
summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
mutate(temp = paste0(contig, ';', product), .before = 1) %>%
arrange(contains_transporter) %>%
column_to_rownames(var = 'temp') %>%
select(contains_transporter) %>%
mutate(contains_transporter = as.numeric(contains_transporter))
# analysis by water layer -------------------------------------------------
# list of MAGs with FL fraction preference, one list per water layer
fl_fract_prefs_list = list(
oxycline = allDaResults %>%
filter(preference == 'free-living enriched', layer == 'oxycline', padj <= 0.01),
shallow_anoxic = allDaResults %>%
filter(preference == 'free-living enriched', layer == 'shallow_anoxic', padj <= 0.01),
euxinic = allDaResults %>%
filter(preference == 'free-living enriched', layer == 'euxinic', padj <= 0.01)
)
#View(fl_fract_prefs_list)
# list of MAGs with PA fraction preference, one list per water layer
pa_fract_prefs_list = list(
oxycline = allDaResults %>%
filter(preference == 'particle enriched', layer == 'oxycline', padj <= 0.01),
shallow_anoxic = allDaResults %>%
filter(preference == 'particle enriched', layer == 'shallow_anoxic', padj <= 0.01),
euxinic = allDaResults %>%
filter(preference == 'particle enriched', layer == 'euxinic', padj <= 0.01)
)
##
fl_transporter_list = fl_fract_prefs_list %>%
map(~
clust_gene_summary %>%
filter(mag_name %in% .x$mag_name) %>%
left_join(mag_table %>%
select(mag_name, phylum),
by = 'mag_name') %>%
group_by(contig, product) %>%
mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
.after = product) %>%
mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
select(mag_name, phylum, contig, product, contains_transporter) %>%
mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
select(-contains_transporter) %>%
ungroup() %>%
rename(contains_transporter = temp) %>%
distinct()
)
##
pa_transporter_list = pa_fract_prefs_list %>%
map(~
clust_gene_summary %>%
filter(mag_name %in% .x$mag_name) %>%
left_join(mag_table %>%
select(mag_name, phylum),
by = 'mag_name') %>%
group_by(contig, product) %>%
mutate(contains_transporter = if_else(str_detect(description, 'transport') | str_detect(gene_functions, 'transport'), TRUE, FALSE),
.after = product) %>%
mutate(contains_transporter = if_else(is.na(contains_transporter), FALSE, contains_transporter)) %>%
select(mag_name, phylum, contig, product, contains_transporter) %>%
mutate(temp = if_else(any(contains_transporter), TRUE, FALSE)) %>%
select(-contains_transporter) %>%
ungroup() %>%
rename(contains_transporter = temp) %>%
distinct()
)
# create color specifications for column annotations
anno_colors = list(
Fraction = c(
'PA' = 'darkgreen',
'FL' = 'darkseagreen2'),
Layer = c(
'Oxycline' = 'darkslategray3', #'wheat2',
'Shallow anoxic' = 'gold',# 'khaki2',
'Euxinic' = 'sienna3'),
Season = c('May' = 'slategray',
'November' = 'slategray1'),
`Depth (m)` = c(
'103' = 'grey90',
'148' = 'grey85',
'198' = 'grey80',
'200' = 'grey75',
'234' = 'grey70',
'237' = 'grey65',
'247' = 'grey60',
'267' = 'grey55',
'295' = 'grey50',
'314' = 'grey45',
'900' = 'grey20'
)
)
fl_prefer_expr_profile_oxy_data =
logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, matches(paste0('.*', c(103,148,198,200,234,237), '.*'))) %>%
mutate(across(2:last_col(), ~ expm1(.x))) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(
bgc_genes_10kb %>%
left_join(fl_transporter_list$oxycline %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% #
group_by(mag_name, contig, region, product, contains_transporter) %>%
summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
mutate(across(where(is.double), ~ log1p(.x))) %>%
mutate(temp = paste0(contig, ';', product), .before = 1) %>%
arrange(contains_transporter, row_sums) %>%
column_to_rownames(var = 'temp') %>%
select(-c(mag_name, contig, region, product,
contains_transporter, row_sums)) %>%
relocate(contains('PA')) %>%
as.matrix()
# create column annotation information object
oxy_fl_col_anno = tibble(
sample_name = colnames(fl_prefer_expr_profile_oxy_data),
Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
mutate(Depth = as.integer(Depth),
Layer = case_when(
Depth < 247 ~ 'Oxycline',
Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(Depth = as.character(Depth)) %>%
rename(`Depth (m)` = Depth) %>%
column_to_rownames(var = 'sample_name') %>%
select(-c(`Depth (m)`, Layer))
fl_prefer_expr_profile_oxy_plot = fl_prefer_expr_profile_oxy_data %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
gaps_col = 12,
cluster_cols = FALSE,
show_rownames = FALSE,
annotation_col = oxy_fl_col_anno,
annotation_colors = anno_colors,
treeheight_row = 0,
treeheight_col = 0,
clustering_method = 'ward.D2',
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
'#ebf4f7',
'#aae6fa',
'#f7e96d',
'#f7e43b',
'orange',
'#ff6a00',
'firebrick1',
'darkorchid3',
'purple4'))(3000)),
legend_breaks = seq(0, 6, by = 2),
legend_labels = gt_render(c(
'0',
'6.39 x 10^0',
'5.36 x 10^1',
'4.02 x 10^2')),
show_row_dend = FALSE,
show_column_dend = FALSE
)
pa_prefer_expr_profile_oxy_data =
logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, matches(paste0('.*', c(103,148,198,200,234,237), '.*'))) %>%
mutate(across(2:last_col(), ~ expm1(.x))) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(
bgc_genes_10kb %>%
left_join(pa_transporter_list$oxycline %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% #
group_by(mag_name, contig, region, product, contains_transporter) %>%
summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
#
mutate(across(where(is.double), ~ log1p(.x))) %>%
#
mutate(temp = paste0(contig, ';', product), .before = 1) %>%
arrange(contains_transporter, row_sums) %>%
column_to_rownames(var = 'temp') %>%
select(-c(mag_name, contig, region, product,
contains_transporter, row_sums)) %>%
relocate(contains('PA')) %>%
as.matrix()
# create column annotation information object
oxy_pa_col_anno = tibble(
sample_name = colnames(pa_prefer_expr_profile_oxy_data),
Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
mutate(Depth = as.integer(Depth),
Layer = case_when(
Depth < 247 ~ 'Oxycline',
Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(Depth = as.character(Depth)) %>%
rename(`Depth (m)` = Depth) %>%
column_to_rownames(var = 'sample_name') %>%
select(-c(`Depth (m)`, Layer))
pa_prefer_expr_profile_oxy_plot = pa_prefer_expr_profile_oxy_data %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
gaps_col = 12,
cluster_cols = FALSE,
show_rownames = FALSE,
annotation_col = oxy_pa_col_anno,
annotation_colors = anno_colors,
treeheight_row = 0,
treeheight_col = 0,
clustering_method = 'ward.D2',
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
'#ebf4f7',
'#aae6fa',
'#f7e96d',
'#f7e43b',
'orange',
'#ff6a00',
'firebrick1',
'darkorchid3',
'purple4'))(3000)
),
legend_breaks = 0:4,
legend_labels = gt_render(c(
'0',
'1.72',
'6.39',
'19.09',
'53.60')),
show_row_dend = FALSE,
show_column_dend = FALSE
)
oxycline_frac_pref_expr_plot = plot_grid(
NULL,
grid.grabExpr(draw(
pa_prefer_expr_profile_oxy_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right',
annotation_legend_side = 'right')),
NULL,
grid.grabExpr(draw(
fl_prefer_expr_profile_oxy_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right',
annotation_legend_side = 'right')),
nrow = 1,
rel_widths = c(0.1, 1, 0.1, 1),
labels = c('PA','', 'FL', '')
)
# bgc expr profiles of fl- and pa- pref mags in -- SHALLOW ANOXIC ---------------
fl_prefer_expr_profile_anox_data =
logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, matches(paste0('.*', c(247,267,295,314), '.*'))) %>%
mutate(across(2:last_col(), ~ expm1(.x))) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(bgc_genes_10kb %>%
left_join(fl_transporter_list$shallow_anoxic %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% #
group_by(mag_name, contig, region, product, contains_transporter) %>%
summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
#
mutate(across(where(is.double), ~ log1p(.x))) %>%
#
mutate(temp = paste0(contig, ';', product), .before = 1) %>%
arrange(contains_transporter, row_sums) %>%
column_to_rownames(var = 'temp') %>%
select(-c(mag_name, contig, region, product,
row_sums, contains_transporter)) %>%
relocate(colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = x[str_detect(x, 'NPA')],
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_', seq(1, 2, by = 1)), each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(pa, fl)
g = str_split(g, pattern = ';') %>% unlist()
return(g)}}()) %>%
as.matrix()
# create column annotation information object
anox_fl_col_anno = tibble(
sample_name = colnames(fl_prefer_expr_profile_anox_data),
Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
mutate(Depth = as.integer(Depth),
Layer = case_when(
Depth < 247 ~ 'Oxycline',
Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(Depth = as.character(Depth)) %>%
rename(`Depth (m)` = Depth) %>%
column_to_rownames(var = 'sample_name') %>%
select(-c(`Depth (m)`, Layer))
fl_prefer_expr_profile_anox_plot = fl_prefer_expr_profile_anox_data %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
gaps_col = 8,
cluster_cols = FALSE,
show_rownames = FALSE,
annotation_col = anox_fl_col_anno,
annotation_colors = anno_colors,
treeheight_row = 0,
treeheight_col = 0,
clustering_method = 'ward.D2',
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
'#ebf4f7',
'#aae6fa',
'#f7e96d',
'#f7e43b',
'orange',
'#ff6a00',
'firebrick1',
'darkorchid3',
'purple4'))(3000)),
legend_breaks = seq(0, 6, by = 2),
legend_labels = gt_render(c(
'0',
'6.39 x 10^0',
'5.36 x 10^1',
'4.02 x 10^2')),
show_row_dend = FALSE,
show_column_dend = FALSE
)
pa_prefer_expr_profile_anox_data =
logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, matches(paste0('.*', c(247,267,295,314), '.*'))) %>%
mutate(across(2:last_col(), ~ expm1(.x))) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(bgc_genes_10kb %>%
left_join(pa_transporter_list$shallow_anoxic %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% #
group_by(mag_name, contig, region, product, contains_transporter) %>%
summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
#
mutate(across(where(is.double), ~ log1p(.x))) %>%
#
mutate(temp = paste0(contig, ';', product), .before = 1) %>%
arrange(row_sums, contains_transporter) %>%
column_to_rownames(var = 'temp') %>%
select(-c(mag_name, contig, region, product,
row_sums, contains_transporter)) %>%
relocate(colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = x[str_detect(x, 'NPA')],
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_', seq(1, 2, by = 1)), each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(pa, fl)
g = str_split(g, pattern = ';') %>% unlist()
return(g)}}()) %>%
as.matrix()
# create column annotation information object
anox_pa_col_anno = tibble(
sample_name = colnames(pa_prefer_expr_profile_anox_data),
Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
mutate(Depth = as.integer(Depth),
Layer = case_when(
Depth < 247 ~ 'Oxycline',
Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(Depth = as.character(Depth)) %>%
rename(`Depth (m)` = Depth) %>%
column_to_rownames(var = 'sample_name') %>%
select(-c(`Depth (m)`, Layer))
pa_prefer_expr_profile_anox_plot = pa_prefer_expr_profile_anox_data %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
gaps_col = 8,
cluster_cols = FALSE,
show_rownames = FALSE,
annotation_col = anox_pa_col_anno,
annotation_colors = anno_colors,
treeheight_row = 0,
treeheight_col = 0,
clustering_method = 'ward.D2',
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
'#ebf4f7',
'#aae6fa',
'#f7e96d',
'#f7e43b',
'orange',
'#ff6a00',
'firebrick1',
'darkorchid3',
'purple4'))(3000)
),
legend_breaks = 0:5,
legend_labels = gt_render(c(
'0',
'1.72 x 10^0',
'6.39 x 10^0',
'1.91 x 10^1',
'5.36 x 10^1',
'1.47 x 10^2')),
show_row_dend = FALSE,
show_column_dend = FALSE
)
anoxic_frac_pref_expr_plot = plot_grid(
NULL,
grid.grabExpr(draw(
pa_prefer_expr_profile_anox_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right',
annotation_legend_side = 'right')),
NULL,
grid.grabExpr(draw(
fl_prefer_expr_profile_anox_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right',
annotation_legend_side = 'right')),
nrow = 1,
rel_widths = c(0.1, 1, 0.1, 1),
labels = c('PA','', 'FL', '')
)
# bgc expr profiles of fl- and pa- pref mags in -- EUXINIC ---------------
fl_prefer_expr_profile_eux_data =
logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, matches(paste0('.*', 900, '.*'))) %>%
mutate(across(2:last_col(), ~ expm1(.x))) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(bgc_genes_10kb %>%
left_join(fl_transporter_list$euxinic %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% #
group_by(mag_name, contig, region, product, contains_transporter) %>%
summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
#
mutate(across(where(is.double), ~ log1p(.x))) %>%
#
mutate(temp = paste0(contig, ';', product), .before = 1) %>%
arrange(contains_transporter, row_sums) %>%
column_to_rownames(var = 'temp') %>%
select(-c(mag_name, contig, region, product,
row_sums, contains_transporter)) %>%
relocate(colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = x[str_detect(x, 'NPA')],
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_', seq(1, 1, by = 1)), each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(pa, fl)
g = str_split(g, pattern = ';') %>% unlist()
return(g)}}()) %>%
as.matrix()
# create column annotation information object
eux_fl_col_anno = tibble(
sample_name = colnames(fl_prefer_expr_profile_eux_data),
Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
mutate(Depth = as.integer(Depth),
Layer = case_when(
Depth < 247 ~ 'Oxycline',
Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(Depth = as.character(Depth)) %>%
rename(`Depth (m)` = Depth) %>%
column_to_rownames(var = 'sample_name') %>%
select(-c(`Depth (m)`, Layer))
fl_prefer_expr_profile_eux_plot = fl_prefer_expr_profile_eux_data %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
gaps_col = 4,
cluster_cols = FALSE,
show_rownames = FALSE,
annotation_col = eux_fl_col_anno,
annotation_colors = anno_colors,
treeheight_row = 0,
treeheight_col = 0,
clustering_method = 'ward.D2',
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
'#ebf4f7',
'#aae6fa',
'#f7e96d',
'#f7e43b',
'orange',
'#ff6a00',
'firebrick1',
'darkorchid3',
'purple4'))(3000)),
legend_breaks = seq(0, 6, by = 2),
legend_labels = gt_render(c(
'0',
'6.39 x 10^0',
'5.36 x 10^1',
'4.02 x 10^2')),
show_row_dend = FALSE,
show_column_dend = FALSE
)
pa_prefer_expr_profile_eux_data =
logTransformed_tpm_for_bgc_genes_minimap2_res %>%
select(gene_name, matches(paste0('.*', 900, '.*'))) %>%
mutate(across(2:last_col(), ~ expm1(.x))) %>%
mutate(row_sums = rowSums(.[2:ncol(.)])) %>%
left_join(bgc_genes_10kb %>%
left_join(pa_transporter_list$euxinic %>%
select(contig, product, contains_transporter),
by = c('contig', 'product')) %>%
filter(!is.na(contains_transporter)) %>%
select(-c(cluster_kb, contig_edge)),
by = 'gene_name') %>%
filter(!is.na(contains_transporter)) %>%
filter(row_sums > 0) %>%
filter(!mag_name %in% c('MAG_1507', 'MAG_769', 'MAG_983')) %>% #
group_by(mag_name, contig, region, product, contains_transporter) %>%
summarize(across(where(is.double), ~ sum(.x)), .groups = 'drop') %>%
#
mutate(across(where(is.double), ~ log1p(.x))) %>%
#
mutate(temp = paste0(contig, ';', product), .before = 1) %>%
arrange(row_sums, contains_transporter) %>%
column_to_rownames(var = 'temp') %>%
select(-c(mag_name, contig, region, product,
row_sums, contains_transporter)) %>%
relocate(colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = x[str_detect(x, 'NPA')],
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_', seq(1, 1, by = 1)), each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(pa, fl)
g = str_split(g, pattern = ';') %>% unlist()
return(g)}}()) %>%
as.matrix()
# create column annotation information object
eux_pa_col_anno = tibble(
sample_name = colnames(pa_prefer_expr_profile_eux_data),
Depth = str_replace(sample_name, '.*(\\d{3}).*', '\\1'),
Fraction = if_else(str_detect(sample_name, 'PA'), 'PA', 'FL'),
Season = if_else(str_detect(sample_name, 'M'), 'May', 'November')
) %>%
mutate(Depth = as.integer(Depth),
Layer = case_when(
Depth < 247 ~ 'Oxycline',
Depth >= 247 & Depth < 900 ~ 'Shallow anoxic',
TRUE ~ 'Euxinic'
)) %>%
mutate(Depth = as.character(Depth)) %>%
rename(`Depth (m)` = Depth) %>%
column_to_rownames(var = 'sample_name') %>%
select(-c(`Depth (m)`, Layer))
pa_prefer_expr_profile_eux_plot = pa_prefer_expr_profile_eux_data %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
gaps_col = 8,
cluster_cols = FALSE,
show_rownames = FALSE,
annotation_col = eux_pa_col_anno,
annotation_colors = anno_colors,
treeheight_row = 0,
treeheight_col = 0,
clustering_method = 'ward.D2',
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
'#ebf4f7',
'#aae6fa',
'#f7e96d',
'#f7e43b',
'orange',
'#ff6a00',
'firebrick1',
'darkorchid3',
'purple4'))(3000)
),
legend_breaks = 0:3,
legend_labels = gt_render(c(
'0',
'1.72 x 10^0',
'6.39 x 10^0',
'1.91 x 10^1')),
show_row_dend = FALSE,
show_column_dend = FALSE
)
euxinic_frac_pref_expr_plot = plot_grid(
NULL,
grid.grabExpr(draw(
pa_prefer_expr_profile_eux_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right',
annotation_legend_side = 'right')),
NULL,
grid.grabExpr(draw(
fl_prefer_expr_profile_eux_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right',
annotation_legend_side = 'right')),
nrow = 1,
rel_widths = c(0.1, 1, 0.1, 1),
labels = c('PA','', 'FL', '')
)
all_bgc_expr_frac_pref_plots = plot_grid(
NULL,
oxycline_frac_pref_expr_plot,
NULL,
anoxic_frac_pref_expr_plot,
NULL,
euxinic_frac_pref_expr_plot,
ncol = 2,
nrow = 3,
labels = c('a', '', 'b', '', 'c', ''),
rel_widths = c(0.05, 1)
)
all_bgc_expr_frac_pref_plots
annotated as ladderanes
logTransformed_tpm_for_bgc_genes_minimap2_res = readRDS('~/Documents/cariaco-tidy-analysis/rdata-files/logTransformed_tpm_for_bgc_genes_minimap2_res.rds')
ladderane_colAnno = readRDS('~/Documents/cariaco-tidy-analysis/cariaco_github_data_small/rdata/ladderane_colAnno.rds')
allDaResults_df = allDaResults %>%
map_dfr(~ .x) %>%
group_by(mag_name) %>%
mutate(strict_fraction_preference = case_when(
all(preference == 'free-living enriched') ~ 'FL',
all(preference == 'particle enriched') ~ 'PA',
TRUE ~ 'No strict preference')) %>%
ungroup()
ladderane_rowAnno = logTransformed_tpm_for_bgc_genes_minimap2_res %>%
left_join(
bgc_genes_10kb %>%
select(gene_name, product, contig),
by = 'gene_name') %>%
relocate(c(product, contig), .after = 1) %>%
filter(product == 'ladderane') %>%
mutate(row_sums = rowSums(.[4:ncol(.)]), .after = contig) %>%
filter(row_sums > 0) %>%
select(-c(gene_name, row_sums)) %>%
group_by(contig, product) %>%
summarize(across(everything(), ~ sum(.x)), .groups = 'drop') %>%
mutate(temp = paste0(contig, ';', product),
mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) %>%
select(temp, mag_name) %>%
left_join(
mag_table %>%
select(mag_name, phylum),
by = 'mag_name') %>%
left_join(
allDaResults_df %>%
select(mag_name, strict_fraction_preference) %>%
distinct(),
by = 'mag_name') %>%
select(-mag_name) %>%
column_to_rownames(var = 'temp') %>%
rename(
Phylum = phylum,
`Fraction preference` = strict_fraction_preference)
ladderane_relPalette <- list(
Layer = c(
'Oxycline' = 'darkslategray3', #'wheat2',
'Shallow anoxic' = 'gold',# 'khaki2',
'Euxinic' = 'sienna3'),
Fraction = c('PA' = 'darkgreen',
'FL' = 'darkseagreen2'),
Season = c('May' = 'slategray',
'November' = 'slategray1'),
`Depth (m)` = c(
'103' = 'grey90',
'148' = 'grey85',
'198' = 'grey80',
'200' = 'grey75',
'234' = 'grey70',
'237' = 'grey65',
'247' = 'grey60',
'267' = 'grey55',
'295' = 'grey50',
'314' = 'grey45',
'900' = 'grey20'),
Phylum = c('Planctomycetota' = 'khaki',
'Desulfobacterota' = '#666666',
'Myxococcota' = 'orange',
'Fibrobacterota' = 'orchid1',
'GCA-001730085' = 'slateblue1',
'Latescibacterota' = 'aquamarine',
'Omnitrophota' = 'darkviolet',
'Verrucomicrobiota' = 'gainsboro',
'UBP3' = 'deepskyblue'),
`Fraction preference` = c('FL' = 'grey70',
'PA' = 'black',
'No strict preference' = 'white'))
ladderane_heat_data =
logTransformed_tpm_for_bgc_genes_minimap2_res %>%
left_join(
bgc_genes_10kb %>%
select(gene_name, product, contig),
by = 'gene_name') %>%
relocate(c(product, contig), .after = 1) %>%
filter(product == 'ladderane') %>%
mutate(row_sums = rowSums(.[4:ncol(.)]), .after = contig) %>%
filter(row_sums > 0) %>%
select(-c(gene_name, row_sums)) %>%
mutate(across(where(is.numeric), ~ expm1(.x))) %>%
group_by(contig, product) %>%
summarize(across(everything(), ~ sum(.x)), .groups = 'drop') %>%
mutate(across(where(is.numeric), ~ log1p(.x))) %>%
mutate(temp = paste0(contig, ';', product)) %>%
mutate(mag_name = str_replace(contig, '(MAG_\\d+)_.*', '\\1')) %>%
left_join(
mag_table %>%
select(mag_name, phylum),
by = 'mag_name') %>%
group_by(phylum) %>%
mutate(n = n()) %>%
ungroup() %>%
column_to_rownames(var = 'temp') %>%
arrange(desc(n), phylum) %>%
select(-c(contig, product, phylum, mag_name, n)) %>%
relocate(colnames(.) %>%
{function(x) {
tbl = tibble(
mpa = x[str_detect(x, 'MPA')],
npa = x[str_detect(x, 'NPA')],
mfl = x[str_detect(x, 'MFL')],
nfl = x[str_detect(x, 'NFL')]
) %>%
mutate(groupper = rep(paste0('group_', seq(1, 6, by = 1)), each = 2)) %>%
group_by(groupper) %>%
summarize(across(everything(), ~
paste0(.x, collapse = ';')),
.groups = 'drop') %>%
select(-groupper)
pa = as.vector(rbind(tbl$mpa, tbl$npa))
fl = as.vector(rbind(tbl$mfl, tbl$nfl))
g = c(fl, pa)
g = str_split(g, pattern = ';') %>% unlist()
return(g)}}()
) %>%
as.matrix()
ladderane_colAnno = ladderane_colAnno %>%
rownames_to_column(var = 'sample') %>%
as_tibble() %>%
arrange(factor(sample, levels = colnames(ladderane_heat_data))) %>%
column_to_rownames(var = 'sample')
ladderane_rowAnno = ladderane_rowAnno %>%
rownames_to_column(var = 'label') %>%
as_tibble() %>%
arrange(factor(label, levels = rownames(ladderane_heat_data))) %>%
column_to_rownames(var = 'label')
ladderane_heat_plot = ladderane_heat_data %>%
ComplexHeatmap::pheatmap(
name = 'Transcripts per million',
legend_breaks = 0:4,
legend_labels = gt_render(c(
'0',
'1.72 x 10^0',
'6.40 x 10^0',
'1.91 x 10^1',
'5.36 x 10^1')),
fontsize = 8,
cluster_cols = FALSE,
cluster_rows = FALSE,
show_rownames = FALSE,
annotation_row = ladderane_rowAnno,
annotation_col = ladderane_colAnno,
annotation_colors = ladderane_relPalette,
treeheight_row = 0,
treeheight_col = 0,
clustering_method = 'ward.D2',
color =
c(colorRampPalette(colors = 'white')(1),
colorRampPalette(colors = c(
#'white',
#'gray100',
#'gray100',
#'gray99',
#'gray98',
'#ebf4f7',
'#aae6fa',
#'#c9e6f0',
'#f7e96d',
#'yellow',
'#f7e43b',
#'goldenrod1',
'orange',
'#ff6a00',
'firebrick1',
#'#D73027',
'darkorchid3',
#'purple3',
'purple4'))(3000)))
ladderane_heat_plot_final = ComplexHeatmap::draw(
ladderane_heat_plot,
merge_legend = TRUE,
heatmap_legend_side = 'right'
#annotation_legend_side = 'right'
)
#!/bin/bash
#SBATCH --partition=computer # Queue selection
#SBATCH --job-name=antismash6_array # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address@gmail.com # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1 # one node
#SBATCH --cpus-per-task=1 # one CPU per parallel task
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/final_antismash_array_%A-%a.log# Standard output/error
#SBATCH --array= # fill this in with numbers uniquely affiliated with each of your MAGs
#SBATCH --qos=unlim
#
source activate anti6 # antismash 6.0 conda environment
GENOME_DIR=$(echo "path/to/intput/genome_fasta_files")
GBK_DIR=$(echo "path/to/output/genome_gene_prediction_gbk_files")
GFF_DIR=$(echo "path/to/output/genome/genome_gene_prediction_gff_files/files")
FASTA_DIR=$(echo "path/to/output/genome_gene_prediction_fasta_files")
OUT_DIR=$(echo "path/to/antismash/output/dir")
source activate gbk
cd "$GBK_DIR"
for FASTA in "$GENOME_DIR"/MAG_"$SLURM_ARRAY_TASK_ID".fa
do MAG=$(echo $FASTA | sed -r 's/.*(MAG_.*).fa/\1/')
#create gene predictions in FASTA format - used for rnaseq read alignment later
prodigal \
-i $FASTA \
-q \
-d "$FASTA_DIR"/"$MAG"-genes.fa
#create gene predictions in GFF format - to create GBK file antismash input
prodigal \
-i $FASTA \
-q \
-f gff \
-o "$GFF_DIR"/"$MAG".gff
#create GBK file that will be used as antismash input -- using gff_to_genbank.py script
python path/to/gff_to_genbank.py \
"$GFF_DIR"/"$MAG".gff $FASTA
mv "$GFF_DIR"/"$MAG".gb "$MAG".gbk
conda deactivate # deactivate conda env
source activate anti6 # activate antismash 6 conda env
antismash "$GBK_DIR"/"$MAG".gbk \
--output-dir "$OUT_DIR"/"$MAG" \
-c 36 \
--clusterhmmer \
--tigrfam \
--smcog-trees \
--cb-general \
--cb-subclusters \
--pfam2go \
--rre \
--cc-mibig \
--asf \
--allow-long-headers \
--genefinding-tool none \
--output-basename "$MAG"
done
#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=minimap2_metaG # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/minimap2_metaG_mapping_over_cariaco_genes_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim
source activate minimap2
cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes
FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")
FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}
REF_DIR=$(echo "/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX")
OUT_DIR=$(echo "/vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaG_mapping_final_results_used")
minimap2 -t 36 -ax sr "$REF_DIR"/cariaco_minimap2_ref.mmi "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz > "$OUT_DIR"/"$FILE_PATTERN".sam
cd "$OUT_DIR"
paftools.js sam2paf "$FILE_PATTERN".sam > "$FILE_PATTERN".paf
#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=minimap2_metaT # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on 36 CPUs
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/minimap2_metaT_mapping_over_cariaco_genes_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim
source activate minimap2
cd /vortexfs1/omics/pachiadaki/dgellermcgrath/all-files-from-scratch-2-8-2021/cariaco_metatranscriptomes
FILE_PATTERN=("MFL103_1" "MFL103_2" "MFL198_1" "MFL198_2" "MFL234_1" "MFL234_2" "MFL295_1" "MFL295_2" "MFL314_1" "MFL314_2" "MFL900_1" "MFL900_2" "MPA103_1" "MPA103_2" "MPA198_1" "MPA198_2" "MPA234_1" "MPA234_2" "MPA295_1" "MPA295_2" "MPA314_1" "MPA314_2" "MPA900_1" "MPA900_2" "NFL148_1" "NFL148_2" "NFL200_1" "NFL200_2" "NFL237_1" "NFL237_2" "NFL247_1" "NFL247_2" "NFL267_1" "NFL267_2" "NFL900_1" "NFL900_2" "NPA148_1" "NPA148_2" "NPA200_1" "NPA200_2" "NPA237_1" "NPA237_2" "NPA247_1" "NPA247_2" "NPA267_1" "NPA267_2" "NPA900_1" "NPA900_2")
FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}
minimap2 -t 36 -ax sr /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/FINAL_MAY_2022_ALL_ANTISMASH6_CONCAT_GENE_FASTA_INDEX/cariaco_minimap2_ref.mmi "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz > /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/"$FILE_PATTERN".sam
cd /vortexfs1/omics/pachiadaki/dgellermcgrath/cariaco-work-051121/antismash6_cariaco_rdata/FINAL_PRODIGAL_AND_ANTISMASH_OUTPUTS_012422/minimap2_metaT_mapping_final_results_used/
paftools.js sam2paf "$FILE_PATTERN".sam > "$FILE_PATTERN".paf
conda activate bigscape # conda env to run BiG-SCAPE
INPUT_GBK_DIR=$(echo "path/to/antismash6/BGC/gbk/files")
PFAM_DIR=$(echo "path/to/downloaded/pfam/directory")
python bigscape.py \
-i "$INPUT_GBK_DIR" \
-o cariaco_bigscape_results_FINAL \
-c 36 \
--pfam_dir "$PFAM_DIR" \
--mibig \
--clan_cutoff 0.3 0.7
#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=coverM_genomeCounts # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on one CPU
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/coverm_mag_counts_of_reads_mapped_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim
source activate coverm
cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes
GENOME_DIR=$(echo "path/to/genome/fasta/dir/")
OUT_DIR=$(echo "path/to/desired/output/dir")
FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")
FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}
coverm genome \
--coupled "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz \
--genome-fasta-directory "$GENOME_DIR" \
--genome-fasta-extension fa \
--mapper bwa-mem \
--methods count \
--min-covered-fraction 0 \
--min-read-percent-identity 95 \
--min-read-aligned-percent 50 \
--exclude-supplementary \
--threads 36 \
-o "$OUT_DIR"/"$FILE_PATTERN"_counts_reads_mapped_per_genome.tsv
#!/bin/bash
#SBATCH --partition=compute # Queue selection
#SBATCH --job-name=coverM_relAbun # Job name
#SBATCH --mail-type=ALL # Mail events (BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=your_email_address # Where to send mail
#SBATCH --ntasks=36 # Run on one CPU
#SBATCH --nodes=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=180gb # Job memory request
#SBATCH --time=1-00:00:00 # Time limit hrs:min:sec
#SBATCH --output=logs/coverm_mag_counts_of_reads_mapped_array_%A-%a.log# Standard output/error
#SBATCH --array=0-47
#SBATCH --qos=unlim
source activate coverm
cd /vortexfs1/omics/pachiadaki/Cariaco-data/Metagenomes
GENOME_DIR=$(echo "path/to/genome/fasta/dir/")
OUT_DIR=$(echo "path/to/desired/output/dir")
FILE_PATTERN=("D1b900AM" "D1b900BM" "D2a143A" "D2a143B" "D2a200A" "D2a200B" "D2a237A" "D2a237B" "D2a247A" "D2a247B" "D2a267A" "D2a267B" "D2a900AN" "D2a900BN" "D2b143A" "D2b143B" "D2b200A" "D2b200B" "D2b237A" "D2b237B" "D2b247A" "D2b247B" "D2b267A" "D2b267B" "D2b900AN" "D2b900BN" "D3a103A" "D3a103B" "D3a198A" "D3a198B" "D3a234A" "D3a234B" "D3a295A" "D3a295B" "D3a314A" "D3a314B" "D3a900AM" "D3a900BM" "D3b103A" "D3b103B" "D3b198A" "D3b198B" "D3b234A" "D3b234B" "D3b295A" "D3b295B" "D3b314A" "D3b314B")
FILE_PATTERN=${FILE_PATTERN[$SLURM_ARRAY_TASK_ID]}
coverm genome \
--coupled "$FILE_PATTERN"_R1_paired.fastq.gz "$FILE_PATTERN"_R2_paired.fastq.gz \
--genome-fasta-directory "$GENOME_DIR" \
--genome-fasta-extension fa \
--mapper bwa-mem \
--methods relative_abundance \
--min-read-percent-identity 95 \
--min-read-aligned-percent 50 \
--exclude-supplementary \
--threads 36 \
-o "$OUT_DIR"/"$FILE_PATTERN"_minReadAlignedPercent95.tsv
conda activate gunc # activate gunc conda environment
LADDERANE_DIR=$(echo "path/to/genomes/containing/predicted/ladderane/clusters/fasta/files/dir/")
GUNC_DB=$(echo "path/to/GUNC/db")
OUT_DIR=$(echo "path/to/desired/output/dir/")
for MAG in "$LADDERANE_DIR"
do gunc run \
-i "$MAG" \
-r "$GUNC_DB" \
--output-dir "$OUT_DIR"/"$MAG" \
--threads 36